commit

1302d2a5 · Richard · ab9f85fb · 1302d2a5
Commit 1302d2a5 authored 2 years ago by Richard
--- a/notebooks/Part_2.ipynb
+++ b/notebooks/Part_2.ipynb
@@ -25,17 +25,18 @@
  },
  {
   "cell_type": "code",
-   "execution_count": 1,
+   "execution_count": null,
   "id": "ae7784a0",
   "metadata": {},
   "outputs": [],
   "source": [
-    "import uproot"
+    "import uproot\n",
+    "import pandas as pd"
   ]
  },
  {
   "cell_type": "code",
-   "execution_count": 2,
+   "execution_count": null,
   "id": "df27fe5c",
   "metadata": {},
   "outputs": [],
@@ -80,7 +81,7 @@
  },
  {
   "cell_type": "code",
-   "execution_count": 3,
+   "execution_count": null,
   "id": "759cf6e0",
   "metadata": {},
   "outputs": [],
@@ -93,21 +94,10 @@
  },
  {
   "cell_type": "code",
-   "execution_count": 4,
+   "execution_count": null,
   "id": "293944b5",
   "metadata": {},
-   "outputs": [
-    {
-     "data": {
-      "text/plain": [
-       "<ReadOnlyDirectory '/' at 0x7f3534853b10>"
-      ]
-     },
-     "execution_count": 4,
-     "metadata": {},
-     "output_type": "execute_result"
-    }
-   ],
+   "outputs": [],
   "source": [
    "# open file\n",
    "file = uproot.open(path)\n",
@@ -116,21 +106,10 @@
  },
  {
   "cell_type": "code",
-   "execution_count": 5,
+   "execution_count": null,
   "id": "e96d18ab",
   "metadata": {},
-   "outputs": [
-    {
-     "data": {
-      "text/plain": [
-       "['DecayTree;1']"
-      ]
-     },
-     "execution_count": 5,
-     "metadata": {},
-     "output_type": "execute_result"
-    }
-   ],
+   "outputs": [],
   "source": [
    "# What is in the file?\n",
    "file.keys()"
@@ -138,21 +117,10 @@
  },
  {
   "cell_type": "code",
-   "execution_count": 6,
+   "execution_count": null,
   "id": "5bd6121c",
   "metadata": {},
-   "outputs": [
-    {
-     "data": {
-      "text/plain": [
-       "<TTree 'DecayTree' (29 branches) at 0x7f35340b17d0>"
-      ]
-     },
-     "execution_count": 6,
-     "metadata": {},
-     "output_type": "execute_result"
-    }
-   ],
+   "outputs": [],
   "source": [
    "decay_tree = file['DecayTree']\n",
    "decay_tree"
@@ -160,7 +128,7 @@
  },
  {
   "cell_type": "code",
-   "execution_count": 7,
+   "execution_count": null,
   "id": "4da7b7c6",
   "metadata": {},
   "outputs": [],
@@ -170,385 +138,10 @@
  },
  {
   "cell_type": "code",
-   "execution_count": 8,
+   "execution_count": null,
   "id": "dddf3ef6",
   "metadata": {},
-   "outputs": [
-    {
-     "data": {
-      "text/html": [
-       "<div>\n",
-       "<style scoped>\n",
-       "    .dataframe tbody tr th:only-of-type {\n",
-       "        vertical-align: middle;\n",
-       "    }\n",
-       "\n",
-       "    .dataframe tbody tr th {\n",
-       "        vertical-align: top;\n",
-       "    }\n",
-       "\n",
-       "    .dataframe thead th {\n",
-       "        text-align: right;\n",
-       "    }\n",
-       "</style>\n",
-       "<table border=\"1\" class=\"dataframe\">\n",
-       "  <thead>\n",
-       "    <tr style=\"text-align: right;\">\n",
-       "      <th></th>\n",
-       "      <th>index</th>\n",
-       "      <th>Jpsi_PE</th>\n",
-       "      <th>Jpsi_PX</th>\n",
-       "      <th>Jpsi_PY</th>\n",
-       "      <th>Jpsi_PZ</th>\n",
-       "      <th>Jpsi_PT</th>\n",
-       "      <th>Jpsi_P</th>\n",
-       "      <th>Jpsi_M</th>\n",
-       "      <th>mum_PT</th>\n",
-       "      <th>mum_PX</th>\n",
-       "      <th>...</th>\n",
-       "      <th>mup_PZ</th>\n",
-       "      <th>mup_IP</th>\n",
-       "      <th>mup_eta</th>\n",
-       "      <th>mup_M</th>\n",
-       "      <th>mup_PE</th>\n",
-       "      <th>nTracks</th>\n",
-       "      <th>mum_ProbNNmu</th>\n",
-       "      <th>mum_ProbNNpi</th>\n",
-       "      <th>mup_ProbNNmu</th>\n",
-       "      <th>mup_ProbNNpi</th>\n",
-       "    </tr>\n",
-       "  </thead>\n",
-       "  <tbody>\n",
-       "    <tr>\n",
-       "      <th>0</th>\n",
-       "      <td>0</td>\n",
-       "      <td>188.630181</td>\n",
-       "      <td>-1.700534</td>\n",
-       "      <td>-9.131937</td>\n",
-       "      <td>188.375806</td>\n",
-       "      <td>9.288923</td>\n",
-       "      <td>188.604688</td>\n",
-       "      <td>3.101106</td>\n",
-       "      <td>4.376341</td>\n",
-       "      <td>-2.246101</td>\n",
-       "      <td>...</td>\n",
-       "      <td>99.674146</td>\n",
-       "      <td>119.018213</td>\n",
-       "      <td>3.608728</td>\n",
-       "      <td>0.105658</td>\n",
-       "      <td>99.820565</td>\n",
-       "      <td>149.0</td>\n",
-       "      <td>0.999983</td>\n",
-       "      <td>0.836058</td>\n",
-       "      <td>0.999994</td>\n",
-       "      <td>0.244674</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>1</th>\n",
-       "      <td>1</td>\n",
-       "      <td>52.385685</td>\n",
-       "      <td>1.816164</td>\n",
-       "      <td>5.595537</td>\n",
-       "      <td>51.961499</td>\n",
-       "      <td>5.882897</td>\n",
-       "      <td>52.293459</td>\n",
-       "      <td>3.107116</td>\n",
-       "      <td>1.735741</td>\n",
-       "      <td>1.552217</td>\n",
-       "      <td>...</td>\n",
-       "      <td>41.621295</td>\n",
-       "      <td>210.293355</td>\n",
-       "      <td>2.851094</td>\n",
-       "      <td>0.105658</td>\n",
-       "      <td>41.900278</td>\n",
-       "      <td>125.0</td>\n",
-       "      <td>0.998874</td>\n",
-       "      <td>0.264369</td>\n",
-       "      <td>0.999999</td>\n",
-       "      <td>0.391294</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>2</th>\n",
-       "      <td>2</td>\n",
-       "      <td>52.068478</td>\n",
-       "      <td>2.552368</td>\n",
-       "      <td>2.817129</td>\n",
-       "      <td>51.837748</td>\n",
-       "      <td>3.801420</td>\n",
-       "      <td>51.976946</td>\n",
-       "      <td>3.086004</td>\n",
-       "      <td>1.110952</td>\n",
-       "      <td>0.179505</td>\n",
-       "      <td>...</td>\n",
-       "      <td>20.279673</td>\n",
-       "      <td>38.272015</td>\n",
-       "      <td>2.632559</td>\n",
-       "      <td>0.105658</td>\n",
-       "      <td>20.490677</td>\n",
-       "      <td>371.0</td>\n",
-       "      <td>0.538509</td>\n",
-       "      <td>0.313881</td>\n",
-       "      <td>0.882305</td>\n",
-       "      <td>0.961390</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>3</th>\n",
-       "      <td>3</td>\n",
-       "      <td>78.399724</td>\n",
-       "      <td>-2.833082</td>\n",
-       "      <td>-0.818953</td>\n",
-       "      <td>78.283360</td>\n",
-       "      <td>2.949075</td>\n",
-       "      <td>78.338889</td>\n",
-       "      <td>3.087923</td>\n",
-       "      <td>2.571993</td>\n",
-       "      <td>-2.028028</td>\n",
-       "      <td>...</td>\n",
-       "      <td>9.020064</td>\n",
-       "      <td>134.767864</td>\n",
-       "      <td>2.792800</td>\n",
-       "      <td>0.105658</td>\n",
-       "      <td>9.088611</td>\n",
-       "      <td>136.0</td>\n",
-       "      <td>0.896250</td>\n",
-       "      <td>0.792830</td>\n",
-       "      <td>0.999992</td>\n",
-       "      <td>0.724581</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>4</th>\n",
-       "      <td>4</td>\n",
-       "      <td>83.900727</td>\n",
-       "      <td>-5.065507</td>\n",
-       "      <td>-3.457333</td>\n",
-       "      <td>83.618226</td>\n",
-       "      <td>6.132904</td>\n",
-       "      <td>83.842831</td>\n",
-       "      <td>3.116368</td>\n",
-       "      <td>3.698279</td>\n",
-       "      <td>-3.220143</td>\n",
-       "      <td>...</td>\n",
-       "      <td>16.851730</td>\n",
-       "      <td>2926.081975</td>\n",
-       "      <td>2.619576</td>\n",
-       "      <td>0.105658</td>\n",
-       "      <td>17.031800</td>\n",
-       "      <td>71.0</td>\n",
-       "      <td>0.998548</td>\n",
-       "      <td>0.270670</td>\n",
-       "      <td>0.999987</td>\n",
-       "      <td>0.921856</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>...</th>\n",
-       "      <td>...</td>\n",
-       "      <td>...</td>\n",
-       "      <td>...</td>\n",
-       "      <td>...</td>\n",
-       "      <td>...</td>\n",
-       "      <td>...</td>\n",
-       "      <td>...</td>\n",
-       "      <td>...</td>\n",
-       "      <td>...</td>\n",
-       "      <td>...</td>\n",
-       "      <td>...</td>\n",
-       "      <td>...</td>\n",
-       "      <td>...</td>\n",
-       "      <td>...</td>\n",
-       "      <td>...</td>\n",
-       "      <td>...</td>\n",
-       "      <td>...</td>\n",
-       "      <td>...</td>\n",
-       "      <td>...</td>\n",
-       "      <td>...</td>\n",
-       "      <td>...</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>168380</th>\n",
-       "      <td>168380</td>\n",
-       "      <td>53.174255</td>\n",
-       "      <td>-1.338545</td>\n",
-       "      <td>1.150779</td>\n",
-       "      <td>53.041120</td>\n",
-       "      <td>1.765218</td>\n",
-       "      <td>53.070486</td>\n",
-       "      <td>3.320384</td>\n",
-       "      <td>1.920701</td>\n",
-       "      <td>-1.838330</td>\n",
-       "      <td>...</td>\n",
-       "      <td>38.316972</td>\n",
-       "      <td>1049.370917</td>\n",
-       "      <td>4.592049</td>\n",
-       "      <td>0.105658</td>\n",
-       "      <td>38.324986</td>\n",
-       "      <td>45.0</td>\n",
-       "      <td>0.999973</td>\n",
-       "      <td>0.957454</td>\n",
-       "      <td>0.985793</td>\n",
-       "      <td>0.380145</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>168381</th>\n",
-       "      <td>168381</td>\n",
-       "      <td>37.853732</td>\n",
-       "      <td>-0.429858</td>\n",
-       "      <td>-1.214533</td>\n",
-       "      <td>37.695700</td>\n",
-       "      <td>1.288359</td>\n",
-       "      <td>37.717710</td>\n",
-       "      <td>3.206147</td>\n",
-       "      <td>1.313456</td>\n",
-       "      <td>1.312666</td>\n",
-       "      <td>...</td>\n",
-       "      <td>25.752567</td>\n",
-       "      <td>244.663700</td>\n",
-       "      <td>3.202198</td>\n",
-       "      <td>0.105658</td>\n",
-       "      <td>25.838128</td>\n",
-       "      <td>127.0</td>\n",
-       "      <td>0.999685</td>\n",
-       "      <td>0.705155</td>\n",
-       "      <td>0.289150</td>\n",
-       "      <td>0.361121</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>168382</th>\n",
-       "      <td>168382</td>\n",
-       "      <td>49.467924</td>\n",
-       "      <td>-2.862092</td>\n",
-       "      <td>1.562724</td>\n",
-       "      <td>49.268784</td>\n",
-       "      <td>3.260932</td>\n",
-       "      <td>49.376581</td>\n",
-       "      <td>3.004789</td>\n",
-       "      <td>0.959976</td>\n",
-       "      <td>0.457486</td>\n",
-       "      <td>...</td>\n",
-       "      <td>41.601480</td>\n",
-       "      <td>613.611564</td>\n",
-       "      <td>3.200199</td>\n",
-       "      <td>0.105658</td>\n",
-       "      <td>41.740035</td>\n",
-       "      <td>122.0</td>\n",
-       "      <td>0.979872</td>\n",
-       "      <td>0.210254</td>\n",
-       "      <td>0.546198</td>\n",
-       "      <td>0.450897</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>168383</th>\n",
-       "      <td>168383</td>\n",
-       "      <td>37.154420</td>\n",
-       "      <td>-0.461408</td>\n",
-       "      <td>-0.982375</td>\n",
-       "      <td>37.034886</td>\n",
-       "      <td>1.085338</td>\n",
-       "      <td>37.050786</td>\n",
-       "      <td>2.773115</td>\n",
-       "      <td>1.772731</td>\n",
-       "      <td>0.098034</td>\n",
-       "      <td>...</td>\n",
-       "      <td>20.800347</td>\n",
-       "      <td>304.353561</td>\n",
-       "      <td>3.763140</td>\n",
-       "      <td>0.105658</td>\n",
-       "      <td>20.823039</td>\n",
-       "      <td>139.0</td>\n",
-       "      <td>0.999972</td>\n",
-       "      <td>0.466231</td>\n",
-       "      <td>0.978200</td>\n",
-       "      <td>0.482338</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>168384</th>\n",
-       "      <td>168384</td>\n",
-       "      <td>29.716630</td>\n",
-       "      <td>1.606319</td>\n",
-       "      <td>-0.307454</td>\n",
-       "      <td>29.542023</td>\n",
-       "      <td>1.635478</td>\n",
-       "      <td>29.587259</td>\n",
-       "      <td>2.769874</td>\n",
-       "      <td>1.477885</td>\n",
-       "      <td>0.850150</td>\n",
-       "      <td>...</td>\n",
-       "      <td>18.002864</td>\n",
-       "      <td>176.687005</td>\n",
-       "      <td>3.058545</td>\n",
-       "      <td>0.105658</td>\n",
-       "      <td>18.082736</td>\n",
-       "      <td>133.0</td>\n",
-       "      <td>0.930024</td>\n",
-       "      <td>0.737257</td>\n",
-       "      <td>0.998675</td>\n",
-       "      <td>0.230615</td>\n",
-       "    </tr>\n",
-       "  </tbody>\n",
-       "</table>\n",
-       "<p>168385 rows × 29 columns</p>\n",
-       "</div>"
-      ],
-      "text/plain": [
-       "         index     Jpsi_PE   Jpsi_PX   Jpsi_PY     Jpsi_PZ   Jpsi_PT  \\\n",
-       "0            0  188.630181 -1.700534 -9.131937  188.375806  9.288923   \n",
-       "1            1   52.385685  1.816164  5.595537   51.961499  5.882897   \n",
-       "2            2   52.068478  2.552368  2.817129   51.837748  3.801420   \n",
-       "3            3   78.399724 -2.833082 -0.818953   78.283360  2.949075   \n",
-       "4            4   83.900727 -5.065507 -3.457333   83.618226  6.132904   \n",
-       "...        ...         ...       ...       ...         ...       ...   \n",
-       "168380  168380   53.174255 -1.338545  1.150779   53.041120  1.765218   \n",
-       "168381  168381   37.853732 -0.429858 -1.214533   37.695700  1.288359   \n",
-       "168382  168382   49.467924 -2.862092  1.562724   49.268784  3.260932   \n",
-       "168383  168383   37.154420 -0.461408 -0.982375   37.034886  1.085338   \n",
-       "168384  168384   29.716630  1.606319 -0.307454   29.542023  1.635478   \n",
-       "\n",
-       "            Jpsi_P    Jpsi_M    mum_PT    mum_PX  ...     mup_PZ       mup_IP  \\\n",
-       "0       188.604688  3.101106  4.376341 -2.246101  ...  99.674146   119.018213   \n",
-       "1        52.293459  3.107116  1.735741  1.552217  ...  41.621295   210.293355   \n",
-       "2        51.976946  3.086004  1.110952  0.179505  ...  20.279673    38.272015   \n",
-       "3        78.338889  3.087923  2.571993 -2.028028  ...   9.020064   134.767864   \n",
-       "4        83.842831  3.116368  3.698279 -3.220143  ...  16.851730  2926.081975   \n",
-       "...            ...       ...       ...       ...  ...        ...          ...   \n",
-       "168380   53.070486  3.320384  1.920701 -1.838330  ...  38.316972  1049.370917   \n",
-       "168381   37.717710  3.206147  1.313456  1.312666  ...  25.752567   244.663700   \n",
-       "168382   49.376581  3.004789  0.959976  0.457486  ...  41.601480   613.611564   \n",
-       "168383   37.050786  2.773115  1.772731  0.098034  ...  20.800347   304.353561   \n",
-       "168384   29.587259  2.769874  1.477885  0.850150  ...  18.002864   176.687005   \n",
-       "\n",
-       "         mup_eta     mup_M     mup_PE  nTracks  mum_ProbNNmu  mum_ProbNNpi  \\\n",
-       "0       3.608728  0.105658  99.820565    149.0      0.999983      0.836058   \n",
-       "1       2.851094  0.105658  41.900278    125.0      0.998874      0.264369   \n",
-       "2       2.632559  0.105658  20.490677    371.0      0.538509      0.313881   \n",
-       "3       2.792800  0.105658   9.088611    136.0      0.896250      0.792830   \n",
-       "4       2.619576  0.105658  17.031800     71.0      0.998548      0.270670   \n",
-       "...          ...       ...        ...      ...           ...           ...   \n",
-       "168380  4.592049  0.105658  38.324986     45.0      0.999973      0.957454   \n",
-       "168381  3.202198  0.105658  25.838128    127.0      0.999685      0.705155   \n",
-       "168382  3.200199  0.105658  41.740035    122.0      0.979872      0.210254   \n",
-       "168383  3.763140  0.105658  20.823039    139.0      0.999972      0.466231   \n",
-       "168384  3.058545  0.105658  18.082736    133.0      0.930024      0.737257   \n",
-       "\n",
-       "        mup_ProbNNmu  mup_ProbNNpi  \n",
-       "0           0.999994      0.244674  \n",
-       "1           0.999999      0.391294  \n",
-       "2           0.882305      0.961390  \n",
-       "3           0.999992      0.724581  \n",
-       "4           0.999987      0.921856  \n",
-       "...              ...           ...  \n",
-       "168380      0.985793      0.380145  \n",
-       "168381      0.289150      0.361121  \n",
-       "168382      0.546198      0.450897  \n",
-       "168383      0.978200      0.482338  \n",
-       "168384      0.998675      0.230615  \n",
-       "\n",
-       "[168385 rows x 29 columns]"
-      ]
-     },
-     "execution_count": 8,
-     "metadata": {},
-     "output_type": "execute_result"
-    }
-   ],
+   "outputs": [],
   "source": [
    "data"
   ]

 %% Cell type:markdown id:50976bc4 tags:

 # Python - part 2

 Now we focus on working with data in python

 To load the data, we will use uproot, a python package that lets you access root files in python

 %% Cell type:markdown id:d6bd170a tags:

 ## Uproot
 two good sources:
 - https://masonproffitt.github.io/uproot-tutorial/ --- nice specific tutorial
 - https://uproot.readthedocs.io/en/latest/basic.html --- bit more in-depth

 %% Cell type:code id:ae7784a0 tags:

 ``` python
 import uproot
+import pandas as pd
 ```

 %% Cell type:code id:df27fe5c tags:

 ``` python
 import numpy as np
 import matplotlib.pyplot as plt
 import mplhep
 plt.style.use(mplhep.style.LHCb2)
 ```

 %% Cell type:markdown id:dd76dc9b tags:

 ---

 %% Cell type:markdown id:7be60f91 tags:

 ## We have data from $J/ \Psi \rightarrow \mu^+ \mu^-$ data from the LHCb detector

 %% Cell type:markdown id:a5123d9f tags:

 One can load the data as a numpy, pandas, or awkward object. We have covered numpy, and so we will go for pandas, as it has some functionality that is especially great for doing quick and easy-to-read operations on data

 %% Cell type:markdown id:3c7c1138 tags:

 ### What is uproot

 %% Cell type:code id:759cf6e0 tags:

 ``` python
 # define where file is:
 path = 'https://cern.ch/starterkit/data/advanced-python-2018/real_data.root'
 # alternate
 path = "./data.root"
 ```

 %% Cell type:code id:293944b5 tags:

 ``` python
 # open file
 file = uproot.open(path)
 file
 ```

-%% Output
-
-    <ReadOnlyDirectory '/' at 0x7f3534853b10>
-
 %% Cell type:code id:e96d18ab tags:

 ``` python
 # What is in the file?
 file.keys()
 ```

-%% Output
-
-    ['DecayTree;1']
-
 %% Cell type:code id:5bd6121c tags:

 ``` python
 decay_tree = file['DecayTree']
 decay_tree
 ```

-%% Output
-
-    <TTree 'DecayTree' (29 branches) at 0x7f35340b17d0>
-
 %% Cell type:code id:4da7b7c6 tags:

 ``` python
 data = decay_tree.arrays(library='pd')
 ```

 %% Cell type:code id:dddf3ef6 tags:

 ``` python
 data
 ```

-%% Output
-
-             index     Jpsi_PE   Jpsi_PX   Jpsi_PY     Jpsi_PZ   Jpsi_PT  \
-    0            0  188.630181 -1.700534 -9.131937  188.375806  9.288923
-    1            1   52.385685  1.816164  5.595537   51.961499  5.882897
-    2            2   52.068478  2.552368  2.817129   51.837748  3.801420
-    3            3   78.399724 -2.833082 -0.818953   78.283360  2.949075
-    4            4   83.900727 -5.065507 -3.457333   83.618226  6.132904
-    ...        ...         ...       ...       ...         ...       ...
-    168380  168380   53.174255 -1.338545  1.150779   53.041120  1.765218
-    168381  168381   37.853732 -0.429858 -1.214533   37.695700  1.288359
-    168382  168382   49.467924 -2.862092  1.562724   49.268784  3.260932
-    168383  168383   37.154420 -0.461408 -0.982375   37.034886  1.085338
-    168384  168384   29.716630  1.606319 -0.307454   29.542023  1.635478
-    
-                Jpsi_P    Jpsi_M    mum_PT    mum_PX  ...     mup_PZ       mup_IP  \
-    0       188.604688  3.101106  4.376341 -2.246101  ...  99.674146   119.018213
-    1        52.293459  3.107116  1.735741  1.552217  ...  41.621295   210.293355
-    2        51.976946  3.086004  1.110952  0.179505  ...  20.279673    38.272015
-    3        78.338889  3.087923  2.571993 -2.028028  ...   9.020064   134.767864
-    4        83.842831  3.116368  3.698279 -3.220143  ...  16.851730  2926.081975
-    ...            ...       ...       ...       ...  ...        ...          ...
-    168380   53.070486  3.320384  1.920701 -1.838330  ...  38.316972  1049.370917
-    168381   37.717710  3.206147  1.313456  1.312666  ...  25.752567   244.663700
-    168382   49.376581  3.004789  0.959976  0.457486  ...  41.601480   613.611564
-    168383   37.050786  2.773115  1.772731  0.098034  ...  20.800347   304.353561
-    168384   29.587259  2.769874  1.477885  0.850150  ...  18.002864   176.687005
-    
-             mup_eta     mup_M     mup_PE  nTracks  mum_ProbNNmu  mum_ProbNNpi  \
-    0       3.608728  0.105658  99.820565    149.0      0.999983      0.836058
-    1       2.851094  0.105658  41.900278    125.0      0.998874      0.264369
-    2       2.632559  0.105658  20.490677    371.0      0.538509      0.313881
-    3       2.792800  0.105658   9.088611    136.0      0.896250      0.792830
-    4       2.619576  0.105658  17.031800     71.0      0.998548      0.270670
-    ...          ...       ...        ...      ...           ...           ...
-    168380  4.592049  0.105658  38.324986     45.0      0.999973      0.957454
-    168381  3.202198  0.105658  25.838128    127.0      0.999685      0.705155
-    168382  3.200199  0.105658  41.740035    122.0      0.979872      0.210254
-    168383  3.763140  0.105658  20.823039    139.0      0.999972      0.466231
-    168384  3.058545  0.105658  18.082736    133.0      0.930024      0.737257
-    
-            mup_ProbNNmu  mup_ProbNNpi
-    0           0.999994      0.244674
-    1           0.999999      0.391294
-    2           0.882305      0.961390
-    3           0.999992      0.724581
-    4           0.999987      0.921856
-    ...              ...           ...
-    168380      0.985793      0.380145
-    168381      0.289150      0.361121
-    168382      0.546198      0.450897
-    168383      0.978200      0.482338
-    168384      0.998675      0.230615
-    
-    [168385 rows x 29 columns]
-
 %% Cell type:code id:af6cacfd tags:

 ``` python
 data.Jpsi_M
 ```

 %% Cell type:markdown id:8684e2ab tags:

 uproot arrays method has some useful capabilities.

 First there is the expressions option, which specifies which variables we want. For a very large data set we may not be able to keep all the variables in memory, or we may just limit to a subset to keep it quick.

 %% Cell type:code id:bb840592 tags:

 ``` python
 branches_we_want = ["Jpsi_M","nTracks"]

 data = decay_tree.arrays( expressions = branches_we_want , library='pd')
 data
 ```

 %% Cell type:markdown id:80cb1e54 tags:

 We can also use aliases, which allow us to both rename variables, and compute expressions

 %% Cell type:code id:d5667429 tags:

 ``` python
 branches_we_want = ["Jpsi_M","nTracks","MyVariable_1","MyVariable_2","Jpsi_PT"]
 aliases = {"MyVariable_1":"Jpsi_PT",
           "MyVariable_2":"sqrt( mum_PX**2 + mum_PY**2 + mum_PZ**2)",
          }

 data = decay_tree.arrays( expressions = branches_we_want , aliases = aliases , library='pd')
 data
 ```

 %% Cell type:markdown id:c6c36bed tags:

 The arrays method has more options, see https://uproot.readthedocs.io/en/latest/uproot.behaviors.TBranch.HasBranches.html#uproot-behaviors-tbranch-hasbranches-arrays

 %% Cell type:markdown id:0fe99007 tags:

 We do need to close the file too

 %% Cell type:code id:8a7552f5 tags:

 ``` python
 file.close()
 ```

 %% Cell type:markdown id:683a267b tags:

 So there you go! We have accessed the data in this root file. There are some more complicated things one can do with uproot, which the links at the beginning should have information on, but you may never need to do anything more complicated

 %% Cell type:markdown id:86a82f58 tags:

 ### Let's get started with the data

 %% Cell type:code id:0f2fa652 tags:

 ``` python
 with uproot.open('https://cern.ch/starterkit/data/advanced-python-2018/real_data.root') as file:
    tree = file['DecayTree']
    data_df = tree.arrays(library='pd')
 ## we have put this in a with statement so the file is automatically closed once we leave the statement
 ## prevents wasting memory
 ```

 %% Cell type:markdown id:d2282321 tags:

 In case that doesn't work

 %% Cell type:code id:b05952b2 tags:

 ``` python
 import pandas as pd
 data_df = pd.read_csv("./Data_for_part_2.csv")
 ```

 %% Cell type:code id:ee80c895 tags:

 ``` python
 with uproot.open("./data.root") as file:
    tree = file['DecayTree']
    data_df = tree.arrays(library='pd')
 ```

 %% Cell type:markdown id:7e41618b tags:

 ## lots of data!
 what is all this? using .columns method, we can see all the data we have

 %% Cell type:code id:fb3340c4 tags:

 ``` python
 data_df.columns
 ```

 %% Cell type:markdown id:1c676ddf tags:

 Let us have a look at just the $J/\Psi$ mass

 %% Cell type:code id:30faa5b3 tags:

 ``` python
 Jpsi_mass = data_df['Jpsi_M']
 ```

 %% Cell type:markdown id:dbbb7718 tags:

 How about a plot

 %% Cell type:markdown id:480f4021 tags:

 ## challenge 1 - Make a histogram of the $J/\Psi$ mass with 100 bins

 %% Cell type:code id:b5127d8c tags:

 ``` python
 ### space to do challenge

 Jpsi_mass = data_df['Jpsi_M']
 plt.hist(Jpsi_mass , bins = 100);

 ### hint: plt.hist( X ) makes a histogram of the data X
 ### hint: plt.hist( X , bins = N ) makes a histogram of the data X with N bins
 ```

 %% Cell type:markdown id:dc937608 tags:

 ---

 %% Cell type:markdown id:352ac0c8 tags:

 ---

 %% Cell type:markdown id:311e90e5 tags:

 ### Cool way to do plots

 %% Cell type:markdown id:d32405b6 tags:

 We will make a useful function to do plots. Don't worry what it is doing in any detail. T

 here is some documentation here https://mplhep.readthedocs.io/en/latest/api.html#mplhep.histplot

 %% Cell type:code id:b53bca6d tags:

 ``` python
 def plot_mass(df, label = None, density = False, bins = 100):
    h, bins = np.histogram(df['Jpsi_M'], bins=bins, range=[2.75, 3.5])
    mplhep.histplot(h, bins, density=density, yerr=True, label = label)
    plt.xlabel('$J/\\psi$ mass [GeV]')
    plt.xlim(bins[0], bins[-1])
 ```

 %% Cell type:code id:8cf8cb81 tags:

 ``` python
 plot_mass(data_df, label = None, density = False, bins = 100)
 ```

 %% Cell type:markdown id:53a019f1 tags:

 ---

 %% Cell type:markdown id:47614f0f tags:

 ---

 %% Cell type:markdown id:ca989c83 tags:

 ### working with the data
 Pandas let's us play with the data to make other variables in simple strings: this used the data frames eval method

 %% Cell type:markdown id:8f92507c tags:

 ### $\eta = arctanh( \frac{p_z}{p} )$
 eta is a common particle physics parameter

 %% Cell type:code id:28da20a0 tags:

 ``` python
 data_df.eval('Jpsi_eta = arctanh(Jpsi_PZ/Jpsi_P)', inplace=True)
 ```

 %% Cell type:code id:3edf7fda tags:

 ``` python
 data_df["Jpsi_eta"]
 ```

 %% Cell type:markdown id:a2381f11 tags:

 We have now computed eta for Jpsi for each event, and added it to the data frame as if it was in the root file! Very convenient

 %% Cell type:markdown id:f32cfa60 tags:

 ## challenge 2 - Add variables mup_P & mum_P
 the total momentum of the $\mu^+$ and $\mu^-$ respectivley

 %% Cell type:code id:c16e56cd tags:

 ``` python
 ### space to do challenge
 data_df.eval('mup_P = sqrt( mup_PX**2 + mup_PY**2 + mup_PZ**2)', inplace=True)
 data_df.eval('mum_P = sqrt( mum_PX**2 + mum_PY**2 + mum_PZ**2)', inplace=True)

 ### hint: the sqrt() method can be used in the pandas strings
 ### hint: the total momentum: P = sqrt( Px**2 + Py**2 + Pz**2 )
 ### hint: look at the data frame and it's columns (use data_df.columns) to see what data is available
 ### hint: mup = mu+ ; mum = mu-
 ```

 %% Cell type:markdown id:ca022e8a tags:

 ---

 %% Cell type:markdown id:b58bba9e tags:

 ---

 %% Cell type:markdown id:c5a58215 tags:

 ## Selection
 We can remove any rows of our data frame that correspond to a False result in some statement

 %% Cell type:code id:a7edbbe1 tags:

 ``` python
 Jpsi_PT_before_cut = data_df.Jpsi_PT
 data_with_cuts_df = data_df.query('Jpsi_PT > 4')

 Jpsi_PT_after_cut = data_with_cuts_df.Jpsi_PT
 ```

 %% Cell type:code id:9e953cfe tags:

 ``` python
 plt.hist(Jpsi_PT_before_cut, label="No cut",bins=100)
 plt.hist(Jpsi_PT_after_cut, label="PT cut",bins=100)
 plt.xlabel("$J/\Psi \; P_T$")
 plt.legend()
 ```

 %% Cell type:markdown id:dd873b4b tags:

 ## challenge 3 -
 Make a selection on $J/\Psi$ Mass ($J/\Psi$ Mass > 3.0 GeV) & make a histogram of the mass, including both before and after the cut

 %% Cell type:code id:3da8b30d tags:

 ``` python
 Jpsi_M_before_cut = data_df.Jpsi_M
 data_with_cuts_df = data_df.query('Jpsi_M > 3.0')

 Jpsi_M_after_cut = data_with_cuts_df.Jpsi_M
 plt.hist(Jpsi_M_before_cut, label="No cut",bins=100)
 plt.hist(Jpsi_M_after_cut, label="Jpsi_M cut",bins=100)
 plt.xlabel("$J/\Psi \; M$")
 plt.legend()
 ```

 %% Cell type:markdown id:5b683888 tags:

 ### Background reduction

 %% Cell type:markdown id:ab84263d tags:

 There is always background in real data, so let's try a cut to improve our signal sensitivity.

 %% Cell type:code id:668f1bc8 tags:

 ``` python
 plot_mass(data_df , density = False , label = "no cuts")
 data_with_cuts_df = data_df.query('(Jpsi_PT > 4) & ((mum_ProbNNmu > 0.5) & (mup_ProbNNmu > 0.5))')
 plot_mass(data_with_cuts_df , density = False  , label = r"cuts" )
 plt.legend(loc='best')
 ```

 %% Cell type:markdown id:c648bc60 tags:

 Let's try a density plot to make this a bit clearer to see

 %% Cell type:code id:0dcfd10d tags:

 ``` python
 plot_mass(data_df , density = True , label = "no cuts")
 data_with_cuts_df = data_df.query('(Jpsi_PT > 4) & ((mum_ProbNNmu > 0.5) & (mup_ProbNNmu > 0.5))')
 plot_mass(data_with_cuts_df , density = True  , label = r"cuts" )
 plt.legend(loc='best')
 ```

 %% Cell type:markdown id:673c8aac tags:

 ---

 %% Cell type:markdown id:0de6de84 tags:

 ---

 %% Cell type:markdown id:f12ad316 tags:

 # Use simulation

 %% Cell type:markdown id:c988818a tags:

 To help us figure out what cuts will and won't discriminate between signal and background, rather than just trying to make the histogram pointier, we can look at simulation of signal, and compare it to background

 We get out signal from monte-carlo (MC) simulation

 We use the Sidebands ($J/\Psi$ candidate mass either side of where the signal process is) to model background

 %% Cell type:code id:1c1799ef tags:

 ``` python
 with uproot.open('https://starterkit.web.cern.ch/starterkit/data/advanced-python-2018/simulated_data.root') as mc_file:
    mc_df = mc_file['DecayTree'].arrays(library='pd')
 ```

 %% Cell type:code id:fa0687a4 tags:

 ``` python
 # alternate
 with uproot.open('./MC.root') as mc_file:
    mc_df = mc_file['DecayTree'].arrays(library='pd')
 ```

 %% Cell type:markdown id:0d185edb tags:

 In case that doesn't work, I have a .csv file

 %% Cell type:code id:adb96504 tags:

 ``` python
 mc_df = pd.read_csv("./Simulation_for_part_2.csv")
 ```

 %% Cell type:markdown id:09fb96ba tags:

 This is our $J/\Psi \rightarrow \mu^+ \mu^-$ simulation

 %% Cell type:code id:8b627db6 tags:

 ``` python
 plot_mass(mc_df )
 plt.title("Signal Simulation")
 ```

 %% Cell type:markdown id:73cf59dd tags:

 Often it is useful to use a logarithmic y-scale

 %% Cell type:code id:8a7d6394 tags:

 ``` python
 plot_mass(mc_df )
 plt.yscale("log")
 plt.title("Signal Simulation")
 ```

 %% Cell type:markdown id:e5ccde0f tags:

 # Background

 %% Cell type:markdown id:f267b253 tags:

 We get background from the sidebands

 If signal is roughly only within the region $3.0 $ GeV$ \; < \; M(J/\Psi) \; < \, 3.2 $ GeV, then we can use the data outside this region to model the background.

 Let's take data from outside this region

 %% Cell type:code id:c21de385 tags:

 ``` python
 bkg_df = data_df.query('~(3.0 < Jpsi_M < 3.2)')
 # "~(3.0 < Jpsi_M < 3.2)" = not in the [ 3.0 , 3.2 ] region
 plot_mass(bkg_df)
 ```

 %% Cell type:markdown id:677f3a6a tags:

 ### We now have:
 * bkgd. sample using the sideband: bkg_df
 * Signal. sample using simulation: mc_df

 Great, so now what?

 We can now inspect the different variables at hand to see if they will help disriminate between signal and background

 Let's start with $P_T$

 %% Cell type:code id:3c0c3f73 tags:

 ``` python
 bkgd_PT = bkg_df['Jpsi_PT']
 signal_PT = mc_df['Jpsi_PT']

 plt.hist(bkgd_PT , bins=100, density = True, range=[0, 10], label="Bkgd.", histtype='step')
 plt.hist(signal_PT , bins=100, density = True, range=[0, 10], label="Signal", histtype='step')

 plt.legend()
 ```

 %% Cell type:markdown id:0f1c20cd tags:

 So there is some discrimination! But maybe it's not that powerful

 Here we only have kinematic parameters ( They tell us about the particle momenta )

 We often use 'vertex' variables, since a powerful discrimination uses the vertexing of the B-meson, some examples

 - $\chi^2_{IP}$ We may use the impact parameter of the muons (really the $\chi^2$ because we want to select on the confidence in the muon not being from the PV).
 ***
 - $\chi^2_{vertex}$ The confidence in the two muons having a common vertex, i.e., they came from a B meson.
 ***
 - DIRA -> When we reconstruct a b or c meson decay, we can compute it's momentum from the sum of momenta of daughters. We can also reconstruct it's decay vertex (the displaced vertex or DV) from the best fit common point of the daughter trajectories. The B momentum and the line connecting the PV and DV should be parallel, so it is useful in selecting against  combinatorial, where there isn't a B meson mother.

 %% Cell type:markdown id:c0ea61c6 tags:

 ![picture.png](attachment:picture.png)

 %% Cell type:markdown id:5c691621 tags:

 LHCb doesn't just use vertex information though, we have another powerful tool ... PID

 PID : Particle Idenficiation

 muons can often be misidentified as pions (and vice-versa) since they are similar in mass giving similar RICH signatures.

 We can discriminate pions and muons though. pions interact strongly and so leave signatures in the HCAL, muons pass by the calos more often, and we can observe them in the final part of the LHCb, the muon stations

 %% Cell type:markdown id:566012d1 tags:

 ### Let's look at the probability of the muon candidates being muons

 %% Cell type:code id:dfcb58ec tags:

 ``` python
 bkgd_PID = bkg_df['mup_ProbNNmu']*bkg_df['mum_ProbNNmu']
 signal_PID = mc_df['mup_ProbNNmu']*mc_df['mum_ProbNNmu']

 plt.hist(bkgd_PID , bins=100, density = True, range=[0, 1], label="Bkgd.", histtype='step')
 plt.hist(signal_PID , bins=100, density = True, range=[0, 1], label="Signal", histtype='step')
 plt.xlabel(r"$Prob(\mu^+|\mu)Prob(\mu^-|\mu)$")
 plt.legend()
 ```

 %% Cell type:markdown id:6b03435a tags:

 ### Let's look at the probability of muon candidates being pions

 %% Cell type:code id:30237c5a tags:

 ``` python
 bkgd_PID = bkg_df['mup_ProbNNpi']*bkg_df['mum_ProbNNpi']
 signal_PID = mc_df['mup_ProbNNpi']*mc_df['mum_ProbNNpi']

 plt.hist(bkgd_PID , bins=100, density = True, range=[0, 1], label="Bkgd.", histtype='step')
 plt.hist(signal_PID , bins=100, density = True, range=[0, 1], label="Signal", histtype='step')
 plt.xlabel(r"$Prob(\mu^+|\pi)Prob(\mu^-|\pi)$")

 plt.legend()
 ```

 %% Cell type:markdown id:b70768ef tags:

 So in this case PID information isn't going to help either

 Our best cut is probably in PT

 %% Cell type:code id:19cfa971 tags:

 ``` python
 data_with_cuts_df = data_df.query('(Jpsi_PT > 6)')
 plot_mass(data_with_cuts_df , label = r"$P_T$ > 6 GeV" , density = False )
 plt.legend(loc='best')
 ```

 %% Cell type:markdown id:ecc951d9 tags:

 ---

 %% Cell type:markdown id:60b949fc tags:

 ---

 %% Cell type:markdown id:9fe97433 tags:

 # Extension

 %% Cell type:markdown id:4919da38 tags:

 ## More on Uproot
 We typically work with .root files in HEP, and a useful tool in uproot is to take a pandas data frame, which maybe you have made selections and evaluations on and then make a new .root file. root files tend to be small in size due to compression, and easy for others to access.

 More detail here: https://uproot.readthedocs.io/en/latest/basic.html#writing-ttrees-to-a-file

 %% Cell type:code id:6bc13879 tags:

 ``` python
 with uproot.recreate("./MyRootFile.root") as file:
    file["Data"]=data_df
    file["Simulation"]=mc_df

 ### you have now made a new root file with trees of 'Data' and 'Simulation'
 ```

 %% Cell type:markdown id:0e04d8b3 tags:

 ## More on Pandas

 %% Cell type:markdown id:cc1b4222 tags:

 For my analyses I use uproot to load data, and then work with it using pandas, doing any analysis / fitting on data using zfit (see a future starterkit lesson)

 So let's go through pandas to see why it can be very useful.

 %% Cell type:code id:f6552dfc tags:

 ``` python
 import pandas as pd

 dictionary = {"a":[1,5,3],"b":[4,2,6]}

 data_frame = pd.DataFrame( dictionary )

 data_frame
 ```

 %% Cell type:markdown id:a2faf878 tags:

 We can turn a python dictionary made up of arrays into a pandas data frame

 We can now do some maths on this

 %% Cell type:code id:39d251ef tags:

 ``` python
 data_frame.eval( " c = a + 2*b " )
 ```

 %% Cell type:markdown id:de9b5f30 tags:

 What if I wanted to process the data over a custom function?

 %% Cell type:code id:63dcdfdb tags:

 ``` python
 def function( x , y ):
    return np.maximum( x , y )

 data_frame.eval('d = @function(a, b)')
 ```

 %% Cell type:markdown id:0b8c1759 tags:

 where did c go? We need to use inplace = True

 %% Cell type:code id:9f75de2d tags:

 ``` python
 data_frame.eval( " c = a + 2*b " , inplace = True)
 data_frame.eval('d = @function(a, b)' , inplace = True)
 data_frame
 ```

 %% Cell type:markdown id:c3c1b15f tags:

 The backend of pandas is numpy, so we need to have functions be in terms of numpy

 %% Cell type:markdown id:03c2c3eb tags:

 We can make cuts on pandas data frames:

 %% Cell type:code id:906b43fa tags:

 ``` python
 print("data frame with no cut:")
 print(data_frame)

 cut_string = "@function(a, b) < 6"
 print("* * * ")
 print("data frame with max(a,b) < 6:")
 print(data_frame.query(cut_string))
 ```

 %% Cell type:markdown id:b1238638 tags:

 We can even use f-strings

 %% Cell type:code id:1ba214e1 tags:

 ``` python
 print("no cut:")
 print(data_frame)

 Max = 6

 cut_string = f"@function(a, b) < {Max}"

 data_frame.query(cut_string)
 print("")
 print(f"max(a,b < {Max}):")
 print(data_frame.query(cut_string))
 ```

 %% Cell type:markdown id:779137ec tags:

 This allows you to process data in a very clear way, easy for someone to see what you are doing and spot mistakes. Clearer, shorter code also makes errors less common.

 %% Cell type:markdown id:5186b47e tags:

 ---