Use blinding strategy in hist()

e2792acf · Frank Sauerburger · 684ad551 · e2792acf · e2792acf
Verified Commit e2792acf authored 5 years ago by Frank Sauerburger
--- a/examples/Blinding.ipynb
+++ b/examples/Blinding.ipynb
+{
+ "cells": [
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import pandas as pd\n",
+    "import seaborn as sns\n",
+    "\n",
+    "from nnfwtbn import Variable, Process, Cut, hist, McStack, DataStack, Stack, \\\n",
+    "                    RangeBlindingStrategy\n",
+    "from nnfwtbn import toydata"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "df = toydata.get()"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "# Processes"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "p_ztt = Process(r\"$Z\\rightarrow\\tau\\tau$\", range=(0, 0))\n",
+    "p_sig = Process(r\"Signal\", range=(1, 1))\n",
+    "\n",
+    "p_asimov = Process(r\"Asimov\", selection=lambda d: d.fpid >= 0)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "# Stacks"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "colors = [\"windows blue\", \"amber\", \"greyish\", \"faded green\", \"dusty purple\"]\n",
+    "s_mc = McStack(p_ztt, p_sig, palette=sns.xkcd_palette(colors))\n",
+    "s_data = DataStack(p_asimov)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "# Blinding"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "b_higgs_m = RangeBlindingStrategy(100, 150)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "# Variables"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "v_higgs_m = Variable(r\"$m^H$\", \"higgs_m\", \"GeV\", blinding=b_higgs_m)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "# Actual plot\n",
+    "\n",
+    "Stacks passed to `blind` argument will be blinded according to blind strategy of variable `v_higgs_m`. \n",
+    "## Blind data stack"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "hist(df, v_higgs_m, 20, [s_mc, s_data], range=(0, 200),\n",
+    "     weight=\"weight\",  ratio_label=\"Data / SM\", blind=s_data, diff=True)\n",
+    "None"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Blind MC stack"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "hist(df, v_higgs_m, 20, [s_mc, s_data], range=(0, 200),\n",
+    "     weight=\"weight\",  ratio_label=\"Data / SM\", blind=s_mc)\n",
+    "None"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Blind both stacks\n",
+    "`blind` argument can be a single stack or a list of stacks."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "hist(df, v_higgs_m, 20, [s_mc, s_data], range=(0, 200),\n",
+    "     weight=\"weight\",  ratio_label=\"Data / SM\", blind=[s_mc, s_data])\n",
+    "None"
+   ]
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.7.5"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}
+%% Cell type:code id: tags:
+
+``` python
+import pandas as pd
+import seaborn as sns
+
+from nnfwtbn import Variable, Process, Cut, hist, McStack, DataStack, Stack, \
+                    RangeBlindingStrategy
+from nnfwtbn import toydata
+```
+
+%% Cell type:code id: tags:
+
+``` python
+df = toydata.get()
+```
+
+%% Cell type:markdown id: tags:
+
+# Processes
+
+%% Cell type:code id: tags:
+
+``` python
+p_ztt = Process(r"$Z\rightarrow\tau\tau$", range=(0, 0))
+p_sig = Process(r"Signal", range=(1, 1))
+
+p_asimov = Process(r"Asimov", selection=lambda d: d.fpid >= 0)
+```
+
+%% Cell type:markdown id: tags:
+
+# Stacks
+
+%% Cell type:code id: tags:
+
+``` python
+colors = ["windows blue", "amber", "greyish", "faded green", "dusty purple"]
+s_mc = McStack(p_ztt, p_sig, palette=sns.xkcd_palette(colors))
+s_data = DataStack(p_asimov)
+```
+
+%% Cell type:markdown id: tags:
+
+# Blinding
+
+%% Cell type:code id: tags:
+
+``` python
+b_higgs_m = RangeBlindingStrategy(100, 150)
+```
+
+%% Cell type:markdown id: tags:
+
+# Variables
+
+%% Cell type:code id: tags:
+
+``` python
+v_higgs_m = Variable(r"$m^H$", "higgs_m", "GeV", blinding=b_higgs_m)
+```
+
+%% Cell type:markdown id: tags:
+
+# Actual plot
+
+Stacks passed to `blind` argument will be blinded according to blind strategy of variable `v_higgs_m`.
+## Blind data stack
+
+%% Cell type:code id: tags:
+
+``` python
+hist(df, v_higgs_m, 20, [s_mc, s_data], range=(0, 200),
+     weight="weight",  ratio_label="Data / SM", blind=s_data, diff=True)
+None
+```
+
+%% Cell type:markdown id: tags:
+
+## Blind MC stack
+
+%% Cell type:code id: tags:
+
+``` python
+hist(df, v_higgs_m, 20, [s_mc, s_data], range=(0, 200),
+     weight="weight",  ratio_label="Data / SM", blind=s_mc)
+None
+```
+
+%% Cell type:markdown id: tags:
+
+## Blind both stacks
+`blind` argument can be a single stack or a list of stacks.
+
+%% Cell type:code id: tags:
+
+``` python
+hist(df, v_higgs_m, 20, [s_mc, s_data], range=(0, 200),
+     weight="weight",  ratio_label="Data / SM", blind=[s_mc, s_data])
+None
+```
--- a/nnfwtbn/plot.py
+++ b/nnfwtbn/plot.py
@@ -10,9 +10,10 @@ import seaborn as sns

 from atlasify import atlasify

+from nnfwtbn.stack import Stack
 from nnfwtbn.process import Process
 from nnfwtbn.cut import Cut
-from nnfwtbn.variable import Variable
+from nnfwtbn.variable import Variable, BlindingStrategy
 import nnfwtbn.error as err

 # Store built-in range
@@ -75,9 +76,9 @@ def hist(dataframe, variable, bins, stacks, selection=None,
    Stacks must be Stack objects. The plotting style is defined via the stack
    object.

-    The optional blind argument controls which process should be blinded. The
-    argument can be a single process, a list of processes or None. By default,
-    no process is blinded.
+    The optional blind argument controls which stack should be blinded. The
+    argument can be a single stack, a list of stacks or None. By default,
+    no stack is blinded.

    If the figure argument is omitted, this method creates a new axes and
    figure. If axes only is omitted, the method creates a new axes from the
@@ -203,20 +204,32 @@ def hist(dataframe, variable, bins, stacks, selection=None,
    numerator_hist = None
    denominator_hist = None

+    if blind is None:
+        blind = []
+    elif isinstance(blind, Stack):
+        # Wrap scalar blind stack
+        blind = [blind]
+
    # Handle stack
    max_content = float('-inf')
    for i_stack, stack in enumerate(stacks):
        bottom = 0
        uncertainty = np.array(0)
+        if stack in blind and variable.blinding is not None:
+            c_blind = variable.blinding(variable, bins)
+        else:
+            c_blind = lambda d: d
+             
        for i_process in range_(len(stack)):
            process = stack.processes[i_process]
-            histogram = stack.get_hist(dataframe, i_process, bins, variable,
-                                       weight)
+            histogram = stack.get_hist(c_blind(dataframe), i_process, bins,
+                                       variable, weight)

            # Draw uncertainty iff this is the last processes of the stack
            if i_process == len(stack) - 1:
-                uncertainty = stack.get_total_uncertainty(dataframe, bins,
-                                                          variable, weight)
+                uncertainty = stack.get_total_uncertainty(c_blind(dataframe),
+                                                          bins, variable,
+                                                          weight)

            # Draw process
            histtype = stack.get_histtype(i_process)
@@ -224,8 +237,13 @@ def hist(dataframe, variable, bins, stacks, selection=None,
                kwds = {'markersize': 4, 'fmt': 'o'}
                kwds.update(stack.get_aux(i_process))

-                axes.errorbar(bin_centers, bottom + histogram, uncertainty,
-                              bin_widths / 2, label=process.label, **kwds)
+                non_empty = (histogram != 0)
+
+                axes.errorbar(bin_centers[non_empty],
+                              (bottom + histogram)[non_empty],
+                              uncertainty[non_empty],
+                              (bin_widths / 2)[non_empty],
+                              label=process.label, **kwds)
            else:
                kwds = {}
                kwds.update(stack.get_aux(i_process))
@@ -256,7 +274,7 @@ def hist(dataframe, variable, bins, stacks, selection=None,
    # Handle ratio plot
    if draw_ratio:
        _draw_ratio(dataframe, numerator, denominator, bins, variable, weight,
-                    axes_ratio, diff, ratio_range, ratio_label)
+                    axes_ratio, diff, ratio_range, ratio_label, blind)

    # Draw vertical lines
    for vline in vlines:
@@ -327,27 +345,37 @@ def hist(dataframe, variable, bins, stacks, selection=None,
        return figure, axes

 def _draw_ratio(df, numerator, denominator, bins, variable, weight,
-                axes_ratio, diff=False, ratio_range=None, ratio_label=None):
+                axes_ratio, diff=False, ratio_range=None, ratio_label=None,
+                blind=[]):

-    numerator_hist = numerator.get_total(df, bins, variable, weight)
-    numerator_error = numerator.get_total_uncertainty(df, bins, variable, weight)
-    denominator_hist = denominator.get_total(df, bins, variable, weight)
-    denominator_error = denominator.get_total_uncertainty(df, bins, variable, weight)
+    if numerator in blind and variable.blinding is not None:
+        c_blind = variable.blinding(variable, bins)
+    else:
+        c_blind = lambda d: d
+    numerator_hist = numerator.get_total(c_blind(df), bins, variable, weight)
+    numerator_error = numerator.get_total_uncertainty(c_blind(df), bins, variable, weight)
+
+    if denominator in blind and variable.blinding is not None:
+        c_blind = variable.blinding(variable, bins)
+    else:
+        c_blind = lambda d: d
+    denominator_hist = denominator.get_total(c_blind(df), bins, variable, weight)
+    denominator_error = denominator.get_total_uncertainty(c_blind(df), bins, variable, weight)

    # valid bins with non-zero denominator
-    idx = (denominator_hist != 0)
+    non_empty = (denominator_hist != 0) * (numerator_hist != 0)
    if diff:
-        idx = (idx == idx)
+        non_empty = (numerator_hist != 0)

-    bin_centers = ((bins[1:] + bins[:-1]) / 2)[idx]
-    bin_widths = (bins[1:] - bins[:-1])[idx]
+    bin_centers = ((bins[1:] + bins[:-1]) / 2)[non_empty]
+    bin_widths = (bins[1:] - bins[:-1])[non_empty]

    if diff:
-        ratio = numerator_hist[idx] - denominator_hist[idx]
-        ratio_error = numerator_error[idx]
+        ratio = numerator_hist[non_empty] - denominator_hist[non_empty]
+        ratio_error = numerator_error[non_empty]
    else:
-        ratio = numerator_hist[idx] / denominator_hist[idx]
-        ratio_error = numerator_error[idx] / denominator_hist[idx]
+        ratio = numerator_hist[non_empty] / denominator_hist[non_empty]
+        ratio_error = numerator_error[non_empty] / denominator_hist[non_empty]

    axes_ratio.errorbar(bin_centers,
                        ratio,
@@ -357,18 +385,20 @@ def _draw_ratio(df, numerator, denominator, bins, variable, weight,
                        markersize=4, fmt='o')

    if diff:
+        non_empty = np.ones(len(numerator_hist), dtype='bool')
        rel_error = denominator_error
        band_lower = 0 - rel_error
    else:
-        denominator_hist[~idx] = 1
-        numerator_error[idx] = 0
+        non_empty = (denominator_hist != 0)
+        denominator_hist[~non_empty] = 1
+        numerator_error[non_empty] = 0
        rel_error = denominator_error / denominator_hist
        band_lower = 1 - rel_error

-    axes_ratio.hist(bins[:-1],
+    axes_ratio.hist(bins[:-1][non_empty],
                    bins=bins,
                    bottom=band_lower,
-                    weights=rel_error * 2,
+                    weights=(rel_error * 2)[non_empty],
                    fill=False,
                    hatch='/////',
                    linewidth=0,