
Ethan optimizer script

    Authored by Makayla Vessella

    Script for Ethan to start on vertexing cut optimization -- just run python ethanscript.py in the same dir as the input files (or modify the input file paths to your liking! :) )

    ethanscript.py
    #this is a skeleton of a simple uproot script to loop over the existing
    #MuSA vertexing ntuples to try to optimize selections at the vertex level, such as geometric quantities etc
    #we want to select signal over a few different types of background, including 
    #qcd (in the form of non truth matched vertices from pileup overlay in signal samples), ttbar, and zmumu (not available yet)
    #note that some of the definitions of these objects in the files might change -- this is still very much under development
    #but the general principles should stay the same
    
    #import ROOT
    import numpy as np
    import matplotlib.pyplot as plt
    import uproot
    import awkward as ak
    from scipy.optimize import minimize
    from scipy.optimize import curve_fit
    
    
    uproot.default_library = "ak"  #the arrays() calls below also pass library="ak" explicitly
    
    #load in the signal and background ntuples
    #signal (use our 3GeV, 1000mm sample as the "standard candle")
    signal = uproot.open("3ct1000.root")["AnalysisMiniTree"]
    #ttbar
    ttbar = uproot.open("ttbarntuple.root")["AnalysisMiniTree"]
    
    print("Array-ifying the ntuples, might take a few minutes!")
    print("This is a good time to get a coffee or something")
    print("Starting signal...")
    signalBranches = signal.arrays(filter_name="MuSAvtx_*", library="ak")
    print("Starting ttbar...")
    ttbarBranches = ttbar.arrays(filter_name="MuSAvtx_*", library="ak")
    print("Starting qcd...")
    
    # Create a mask: Select events where any element in MuSAvtx_hasNoTruthMatch is True
    mask = ak.any(signalBranches["MuSAvtx_hasNoTruthMatch"] == 1, axis=1)
    # Apply the mask to the branches
    qcdBranches = signalBranches[mask]
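    #note: this is an event-level mask -- it keeps every vertex in any event that has
    #at least one unmatched vertex. If you want only the unmatched vertices themselves,
    #a per-vertex mask like signalBranches[signalBranches["MuSAvtx_hasNoTruthMatch"] == 1]
    #would do that instead (the same caveat applies to the other ak.any masks below)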
    
    #we also only want to consider the "good" vertices in the signal
    mask = ak.any(signalBranches["MuSAvtx_hasFullTruthMatch"] == 1, axis=1)
    signalBranches = signalBranches[mask]
    
    #zmumu is not available yet...
    
    #we want to determine the optimal selections on (but not limited to) the following variables:
    
    #MuSAvtx_vPosMomAngT
    #this is the cosine of the angle between the transverse displacement vector and the transverse momentum vector
    #this measures how well the SV displacement aligns with the reconstructed pT direction
    #values close to 1 indicate good agreement -- you want the DV to point along the direction of the pT
    #values close to -1 indicate poor agreement -- DVs pointing backwards "towards" the PV are less physical
    #this was set to a simple threshold of -0.8 in the original standard vertexing
    
    #MuSAvtx_vPosPVCompatibility
    #this one is essentially the same thing as above, just in 3D. The name in the file is wrong -- it should be vPosMomAng3D or something
    #it tells a slightly different story, though -- it's harder to fix the z position in ATLAS because of collision dynamics
    #so some of the features may differ from those in the transverse plane
    
    #MuSAvtx_maxPVTrackCompatibility and MuSAvtx_minPVTrackCompatibility
    #these quantify how well the phi of each muon associated with the SV aligns with the SV displacement vector
    #basically cos(delta phi) -- values close to 1 indicate the track is aligned with the SV while -1 indicates it is anti-aligned
    #max is the highest value in the pair and min is the lowest
    #this basically checks if the tracks are more compatible with the PV or the SV. Tracks compatible with the PV are less likely to be a real displaced decay 
    #in the old vertexing, there were two options: the normal one fails the event if BOTH of these are less than -0.8,
    #while a "strict" one fails if EITHER of these is less than -0.8 (a sketch of both is below)
    
    
    #MuSAvtx_maxDeltaR
    #this is the opening angle between the two muons in the vertex
    #our signal is very collimated, so signal will have a small dR, while background/random crossings can have any dR
    #this is a big difference, so it's a powerful discriminator
    
    #MuSAvtx_chi2
    #this is the chi2 of the vertex fit. A good vertex will have a low chi2
    #in the original vertexing this was required to be less than 5 -- here, it's a bit less of a good discriminator,
    #as our vertices have much bigger uncertainties than in the original vertexing, so the chi2 will be much larger
    #there may still be an optimal selection here combined with other things
    
    #MuSAvtx_bob
    #This is the "DV significance" -- effectively the displacement significance, or how far the SV is from the PV in units of combined uncertainty, the displacement vector divided by the fit covariance matrix 
    #If both PV and DV positions were measured perfectly (zero uncertainty), "bob" would be just the displacement, but if the PV or DV have large uncertainties, a big displacement might not be significant.
    #By scaling displacement by the total uncertainty, "bob" gives a proper statistical measure of how unlikely it is for the DV to be at that location just due to measurement error
    #we called it "bob" because it's not technically a "significance" in the proper statistical sense and couldn't settle on a name, so called it bob as a joke. Stuck around for good :P
    #in the previous search the threshold here was 50, but the huge covariances mean that almost no MuSA vertices cross that threshold. 
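
    #for intuition, a minimal numpy sketch of how a displacement significance like "bob"
    #is typically built (hypothetical standalone inputs -- the exact convention in the ntuple may differ)
    def displacement_significance(dv_pos, pv_pos, cov):
        d = np.asarray(dv_pos) - np.asarray(pv_pos)         #3D displacement vector
        return float(np.sqrt(d @ np.linalg.solve(cov, d)))  #sqrt(d^T C^-1 d), a Mahalanobis distance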
    
    #MuSAvtx_Lxy
    #this is just the radial displacement of the vertex in the transverse plane
    #basically what you think of when you think of displaced -- how far out in the "cylinder" of ATLAS. We may want to set a minimum threshold to cut out background vertices that are too close to the PV
    #but this could also cut out a lot of signal.
    
    #MuSAvtx_maxd0 and MuSAvtx_mind0
    #These are the minimum and maximum impact parameters of the two muons to the PV
    #look up "transverse impact parameter" for more info as it's best explained with a picture -- basically how far the point of closest approach of the "arc" of the extrapolated muon track 
    #is from the PV in the transverse plane. This is a good discriminator because signal muons will have a large d0, while background muons will have a smaller d0
    #but since we are considering very displaced, weird objects, it's less cut and dry than in a traditional search
    #mind0 is the minimum of the two, maxd0 is the maximum of the two
    #could have a cut on one or both or a combination of the two like with the pv track compatibilities 
    
    #MuSAvtx_eta and MuSAvtx_phi
    #these are the pseudorapidity and azimuthal angle of the vertex, respectively
    #just the coordinates of our decays -- but more forward (higher absolute value of eta) decays may have different features than more central decays
    #so might be interesting to play around
    #the phi you expect to be isotropic, so it shouldn't have any discriminating power.
    
    
    #you can investigate "optimal" selections on these variables in a few ways 
    #1. just plot the variables for signal and background and see if there are any obvious differences (and get your starting point for the ranges etc)
    #2. plot the signal efficiency vs background rejection for a few different selections
    #3. use a machine learning algorithm to try to optimize the selections
    #4. some combination of the above
    
    # Extract variables for plotting
    signal_vPosMomAngT = ak.to_numpy(ak.flatten(signalBranches["MuSAvtx_vPosMomAngT"]))
    ttbar_vPosMomAngT = ak.to_numpy(ak.flatten(ttbarBranches["MuSAvtx_vPosMomAngT"]))
    qcd_vPosMomAngT = ak.to_numpy(ak.flatten(qcdBranches["MuSAvtx_vPosMomAngT"]))
    
    signal_maxDeltaR = ak.to_numpy(ak.flatten(signalBranches["MuSAvtx_maxDeltaR"]))
    ttbar_maxDeltaR = ak.to_numpy(ak.flatten(ttbarBranches["MuSAvtx_maxDeltaR"]))
    qcd_maxDeltaR = ak.to_numpy(ak.flatten(qcdBranches["MuSAvtx_maxDeltaR"]))
    
    signal_chi2 = ak.to_numpy(ak.flatten(signalBranches["MuSAvtx_chi2"]))
    ttbar_chi2 = ak.to_numpy(ak.flatten(ttbarBranches["MuSAvtx_chi2"]))
    qcd_chi2 = ak.to_numpy(ak.flatten(qcdBranches["MuSAvtx_chi2"]))
    
    signal_Lxy = ak.to_numpy(ak.flatten(signalBranches["MuSAvtx_Lxy"]))
    ttbar_Lxy = ak.to_numpy(ak.flatten(ttbarBranches["MuSAvtx_Lxy"]))
    qcd_Lxy = ak.to_numpy(ak.flatten(qcdBranches["MuSAvtx_Lxy"]))
    
    #... and so on and so forth! Open up the ntuples and see what is inside...
    
    #now, make a histogram of the variable for signal and background
    #normalized to 1 so we can compare the shapes -- there will be a lot more signal than background in these comparisons, numbers-wise
    #this is just a simple example -- you can make it as fancy as you want
    plt.figure()
    plt.hist(signal_vPosMomAngT, bins=100, density=True, alpha=0.5, label="signal")
    plt.hist(ttbar_vPosMomAngT, bins=100, density=True, alpha=0.5, label="ttbar")
    plt.hist(qcd_vPosMomAngT, bins=100, density=True, alpha=0.5, label="qcd")
    plt.legend()
    plt.savefig("1d_vPosMomAngT.png")
    
    plt.figure()
    plt.hist(signal_maxDeltaR, bins=100, density=True, alpha=0.5, label="signal")
    plt.hist(ttbar_maxDeltaR, bins=100, density=True, alpha=0.5, label="ttbar")
    plt.hist(qcd_maxDeltaR, bins=100, density=True, alpha=0.5, label="qcd")
    plt.legend()
    plt.savefig("1d_maxDeltaR.png")
    
    #you can also make a 2D plot of two variables in signal and background to see if there are any correlations, dividing the signal and background samples etc
    #then compare the shapes of the signal and background distributions
    #this is a bit more complicated -- mostly left as an exercise for the reader (a minimal sketch follows), but remember you may have to reduce the number of bins,
    #since the statistics are much lower in the background samples and will get really bad when you get granular in a 2D plot
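
    #a minimal 2D sketch for the exercise above -- binning and ranges are guesses you'd tune:
    fig, axes = plt.subplots(1, 2, figsize=(10, 4), sharex=True, sharey=True)
    axes[0].hist2d(signal_Lxy, signal_maxDeltaR, bins=40, range=[[0, 16000], [0, 5]])
    axes[0].set_title("signal")
    axes[0].set_xlabel("Lxy [mm]")
    axes[0].set_ylabel("maxDeltaR")
    axes[1].hist2d(ttbar_Lxy, ttbar_maxDeltaR, bins=40, range=[[0, 16000], [0, 5]])
    axes[1].set_title("ttbar")
    axes[1].set_xlabel("Lxy [mm]")
    plt.savefig("2d_Lxy_maxDeltaR.png")
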
    #define s/sqrt(s+b) as the metric to optimize
    #this is a common metric in HEP -- the signal yield over the square root of the total (signal plus background) yield
    #it's a good metric because it takes into account both the signal and the background, since sqrt(s+b) approximates the Poisson uncertainty on the total
    #you can also use other metrics -- keep in mind signal efficiency, the fraction of signal events kept by the selection ("true positives"),
    #vs background rejection, the fraction of background events removed by the selection ("true negatives")
    #you can also use a machine learning algorithm to optimize the selections... but we'd like to avoid this at a first pass!
    
    #first, define the metric as a function of the selection
    def sob(signal, background, selection, cut_type="greater"):
        # Convert awkward arrays to numpy
        signal_np = ak.to_numpy(signal)
        background_np = ak.to_numpy(background)
    
        # Total counts before selection
        total_signal = len(signal_np)
        total_background = len(background_np)
    
        # Apply selection cuts dynamically
        if cut_type == "greater":
            signal_selected = signal_np[signal_np > selection]
            background_selected = background_np[background_np > selection]
        else:  # "less"
            signal_selected = signal_np[signal_np < selection]
            background_selected = background_np[background_np < selection]
    
        # Normalize by total counts
        s = len(signal_selected) / total_signal if total_signal > 0 else 0
        b = len(background_selected) / total_background if total_background > 0 else 0
    
        # Compute normalized S/sqrt(S+B)
        return s / np.sqrt(s + b) if (s + b) > 0 else 0
    
    
    #now, loop over a few different selections and find the best one
    #for example for vPosMomAngT, we know that the signal is peaked at 1 and the background is peaked at -1 roughly
    #so we want to select on the signal side of the peak, i.e. look for a greater-than selection
    #note that here we're just doing ttbar for simplicity, you can add qcd or zmumu later!
    best_selection = None
    best_metric = 0
    best_cut_type = None
    
    print("Optimizing selection for vPosMomAngT...")
    
    for selection in np.linspace(-1, 1, 300):
        # Try both greater and less selection
        metric_greater = sob(signal_vPosMomAngT, ttbar_vPosMomAngT, selection, cut_type="greater")
        metric_less = sob(signal_vPosMomAngT, ttbar_vPosMomAngT, selection, cut_type="less")
    
        # Pick the best performing cut type
        if metric_greater > best_metric:
            best_metric = metric_greater
            best_selection = selection
            best_cut_type = "greater"
    
        if metric_less > best_metric:
            best_metric = metric_less
            best_selection = selection
            best_cut_type = "less"
    
    print(f"Best selection: {best_selection} ({best_cut_type})")
    print(f"Best S/sqrt(S+B): {best_metric}")
    
    #You'll note here that the best selection is on the signal side of the peak, as expected
    #but it's very restrictive -- cutting at ~0.98 might reduce our signal efficiency too much. That's why we need to investigate
    #the interplay between other variables -- maybe a more complicated combination of selections will give a better metric
    #(a reusable helper for this kind of scan is sketched below)
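
    #since the same scan is repeated below for other variables, you could wrap it in a
    #helper like this hypothetical optimize_cut (the explicit loops are kept below for clarity):
    def optimize_cut(signal_arr, background_arr, scan_values):
        best = (0.0, None, None)  #(metric, selection, cut_type)
        for selection in scan_values:
            for cut_type in ("greater", "less"):
                metric = sob(signal_arr, background_arr, selection, cut_type=cut_type)
                if metric > best[0]:
                    best = (metric, selection, cut_type)
        return best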
    
    #do this for delta R -- remember here that signal will have a small delta R while ttbar background will have a large delta R
    #so this should end up being a less than selection
    best_metric = 0
    best_cut_type = None
    best_selection = None
    
    print("Optimizing selection for maxDeltaR...")
    
    for selection in np.linspace(0, 5, 500):
        metric_greater = sob(signal_maxDeltaR, ttbar_maxDeltaR, selection, cut_type="greater")
        metric_less = sob(signal_maxDeltaR, ttbar_maxDeltaR, selection, cut_type="less")
    
        if metric_greater > best_metric:
            best_metric = metric_greater
            best_selection = selection
            best_cut_type = "greater"
    
        if metric_less > best_metric:
            best_metric = metric_less
            best_selection = selection
            best_cut_type = "less"
    
    print(f"Best selection: {best_selection} ({best_cut_type})")
    print(f"Best S/sqrt(S+B): {best_metric}")
    
    #you'll see that this ends up selecting events only below 0.4ish, which is what you expect from looking at a plot of these variables, so it's a good sanity check
    #but you'll also notice from the first bin of the plot made above that many QCD events actually have very very low dR, even more so than the signal
    #so would it be better to make a window? or a special additional selection on very low dR events? or will other selections cut this population out anyway, so we don't have to worry about it? (a window-scan sketch follows)
    #You want to start with the "heavy hitters" -- the variables that you know will give you the most discrimination power -- and then move on to the more subtle ones
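
    #a minimal sketch of the window idea above -- scan a (low, high) pair instead of one threshold:
    best_window = (0.0, None, None)
    for low in np.linspace(0, 1, 50):
        for high in np.linspace(low + 0.1, 5, 50):
            s = np.mean((signal_maxDeltaR > low) & (signal_maxDeltaR < high))
            b = np.mean((ttbar_maxDeltaR > low) & (ttbar_maxDeltaR < high))
            metric = s / np.sqrt(s + b) if (s + b) > 0 else 0
            if metric > best_window[0]:
                best_window = (metric, low, high)
    print(f"Best maxDeltaR window: {best_window[1]:.2f} to {best_window[2]:.2f}, S/sqrt(S+B): {best_window[0]:.3f}")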
    
    #do this for Lxy so we have it for the next example
    best_metric = 0
    best_cut_type = None
    best_selection = None
    
    print("Optimizing selection for Lxy...")
    
    for selection in np.linspace(0, 16000, 8000):
        metric_greater = sob(signal_Lxy, ttbar_Lxy, selection, cut_type="greater")
        metric_less = sob(signal_Lxy, ttbar_Lxy, selection, cut_type="less")
    
        if metric_greater > best_metric:
            best_metric = metric_greater
            best_selection = selection
            best_cut_type = "greater"
    
        if metric_less > best_metric:
            best_metric = metric_less
            best_selection = selection
            best_cut_type = "less"
    
    print(f"Best selection: {best_selection} ({best_cut_type})")
    print(f"Best S/sqrt(S+B): {best_metric}")
    
    #note that here it suggests a cut of > 350mm-ish, which makes sense -- we are trying to select very highly displaced signal 
    #while background is more likely to be SM prompt physics
    
    
    #remember that this is just a starting point -- you can make this as complicated as you want!
    #for an example, let's say that we accept that we should only keep events with vPosMomAngT > 0.98, but don't want to lose all that signal efficiency from the cut
    #we could make the cut conditional, applying it only if another variable is in a certain range
    #now, out of the population of signal and background that fails that cut, would selecting on Lxy help us?
    #maybe there is an optimal Lxy value that lets us keep events that fail the vPosMomAngT cut but are still signal-like
    
    #make the mask for that selection
    maskSignal = ak.any(signalBranches["MuSAvtx_vPosMomAngT"] < 0.98, axis=1)
    maskTtbar = ak.any(ttbarBranches["MuSAvtx_vPosMomAngT"] < 0.98, axis=1)
    
    signalBranches_FailVPosMomAngT = signalBranches[maskSignal]
    ttbarBranches_FailVPosMomAngT = ttbarBranches[maskTtbar]
    
    #get the new filtered arrays again
    signal_vPosMomAngT_Fail = ak.to_numpy(ak.flatten(signalBranches_FailVPosMomAngT["MuSAvtx_vPosMomAngT"]))
    ttbar_vPosMomAngT_Fail = ak.to_numpy(ak.flatten(ttbarBranches_FailVPosMomAngT["MuSAvtx_vPosMomAngT"]))
    signal_Lxy_Fail = ak.to_numpy(ak.flatten(signalBranches_FailVPosMomAngT["MuSAvtx_Lxy"]))
    ttbar_Lxy_Fail = ak.to_numpy(ak.flatten(ttbarBranches_FailVPosMomAngT["MuSAvtx_Lxy"]))
    
    best_metric = 0
    best_selection = None
    best_cut_type = None
    
    print("Optimizing selection for Lxy for events failing VPosMomAngT...")
    
    for selection in np.linspace(0.01, 16000, 8000):
        metric_greater = sob(signal_Lxy_Fail, ttbar_Lxy_Fail, selection, cut_type="greater")
        metric_less = sob(signal_Lxy_Fail, ttbar_Lxy_Fail, selection, cut_type="less")
    
        if metric_greater > best_metric:
            best_metric = metric_greater
            best_selection = selection
            best_cut_type = "greater"
    
        if metric_less > best_metric:
            best_metric = metric_less
            best_selection = selection
            best_cut_type = "less"
    
    print(f"Best selection: {best_selection} ({best_cut_type})")
    print(f"Best S/sqrt(S+B): {best_metric}")
    
    #from this you'll see it picks an even less restrictive cut for Lxy for this population, ~150mm, which suggests this was a bad idea, at least as the first-line selection without other cuts! This is just for the exercise -- we wouldn't want to do this...
    #you can also try to optimize the selections on 2D/multiple variables at once -- this is a bit more complicated, but you can use a grid search or a more sophisticated algorithm
    #for example let's take a look at Lxy and maxDeltaR -- we know that signal will have a small maxDeltaR and a large Lxy, while background will have a large maxDeltaR and a small Lxy
    #so we can try to optimize the selections on both of these variables at once with some sort of fit function
    
    #define a function for Lxy-dependent maxDeltaR cut
    def func(lxy, a, b, c):
        return a + b * lxy + c * lxy**2  #quadratic just for example -- you'd want to look at the 2d plot and think about what kind of function would fit the data...
    
    # Fit to signal sample
    params, _ = curve_fit(func, signal_Lxy, signal_maxDeltaR, p0=[0.1, -0.00001, 0])
    
    # Apply cut dynamically
    mask_signal = signal_maxDeltaR < func(signal_Lxy, *params)
    mask_ttbar = ttbar_maxDeltaR < func(ttbar_Lxy, *params)
    
    # Normalize by total counts
    total_signal = len(signal_Lxy)
    total_ttbar = len(ttbar_Lxy)
    
    # Compute S/sqrt(S+B) for the new dynamic cut
    s = np.sum(mask_signal) / total_signal if total_signal > 0 else 0
    b = np.sum(mask_ttbar) / total_ttbar if total_ttbar > 0 else 0
    sob_metric = s / np.sqrt(s + b) if (s + b) > 0 else 0
    
    print(f"Optimized Lxy-dependent maxDeltaR cut: maxDeltaR < {params[0]} + {params[1]} * Lxy + {params[2]} * Lxy^2")
    print(f"S/sqrt(S+B) with new cut: {sob_metric}")
    
    #you'll note from the output here that this cut is worse than just selecting on maxDeltaR alone -- so this isn't a good idea either. You only want to do this
    #if the two variables are correlated in a way that's harder to disentangle than a flat cut -- see how tricky this can be? :-) To pick these out, you want to investigate correlations with 2D plots
    #(I am just grabbing things off the top of my head for this example). The plain grid search mentioned earlier is sketched below
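
    #a minimal sketch of a grid search over rectangular cut pairs (Lxy > x AND maxDeltaR < y),
    #scanning both variables together instead of one at a time (scan ranges are guesses you'd refine from the 1D results):
    best_pair = (0.0, None, None)
    for lxy_cut in np.linspace(0, 2000, 40):
        for dr_cut in np.linspace(0.1, 2, 40):
            s = np.mean((signal_Lxy > lxy_cut) & (signal_maxDeltaR < dr_cut))
            b = np.mean((ttbar_Lxy > lxy_cut) & (ttbar_maxDeltaR < dr_cut))
            metric = s / np.sqrt(s + b) if (s + b) > 0 else 0
            if metric > best_pair[0]:
                best_pair = (metric, lxy_cut, dr_cut)
    print(f"Best rectangular cut: Lxy > {best_pair[1]:.0f} mm and maxDeltaR < {best_pair[2]:.2f}, S/sqrt(S+B): {best_pair[0]:.3f}")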
    
    #lastly you can also compute a ROC curve and try to take an optimal value from there -- ROC curves are a good way to visualize the tradeoff between signal efficiency and background rejection
    #you can also compute the area under the curve (AUC) to get a single number that summarizes the performance of the selection
    #a random selection will have an AUC of 0.5 (so a diagonal line), while a perfect selection will have an AUC of 1 (so a line that goes up to 1 and then across to 1 or vice versa)
    
    # Function to compute and plot ROC curves
    def compute_roc(signal, ttbar, qcd, bins, variable_name):
        print(f"Computing ROC Curves for {variable_name}...")
        plt.figure()
        signal_hist, _ = np.histogram(signal, bins=bins, density=True)
        ttbar_hist, _ = np.histogram(ttbar, bins=bins, density=True)
        qcd_hist, _ = np.histogram(qcd, bins=bins, density=True)
    
        signal_eff = np.cumsum(signal_hist) / np.sum(signal_hist)
        ttbar_rej = 1 - np.cumsum(ttbar_hist) / np.sum(ttbar_hist)
        qcd_rej = 1 - np.cumsum(qcd_hist) / np.sum(qcd_hist)
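        # note: cumulating from the low bin edge corresponds to a "less than" cut;
        # for variables where the signal sits at high values (like vPosMomAngT) the curve
        # and the cut read off below are for the "wrong" direction -- flip the cumsum
        # (or the variable's sign) if you want the greater-than version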
    
        plt.plot(signal_eff, ttbar_rej, label=f"{variable_name} (ttbar)")
        plt.plot(signal_eff, qcd_rej, label=f"{variable_name} (qcd)")
        plt.xlabel("Signal Efficiency")
        plt.ylabel("Background Rejection")
        
        plt.legend()
        plt.savefig(f"roc_curve_{variable_name}.png")
    
        distances_ttbar = signal_eff - (1 - ttbar_rej)
        distances_qcd = signal_eff - (1 - qcd_rej)
        optimal_idx_ttbar = np.argmax(distances_ttbar)
        optimal_idx_qcd = np.argmax(distances_qcd)
        optimal_cut_ttbar = bins[optimal_idx_ttbar]
        optimal_cut_qcd = bins[optimal_idx_qcd]
        return optimal_cut_ttbar, optimal_cut_qcd
    
    # Define bins for each variable
    bins_vPosMomAngT = np.linspace(-1, 1, 300)
    bins_chi2 = np.linspace(0.01, 50, 100)
    bins_maxDeltaR = np.linspace(0, 5, 500)
    bins_Lxy = np.linspace(0, 16000, 8000)
    
    # Compute ROC curves and optimal cuts
    optimal_cut_vPosMomAngT_ttbar, optimal_cut_vPosMomAngT_qcd = compute_roc(signal_vPosMomAngT, ttbar_vPosMomAngT, qcd_vPosMomAngT, bins_vPosMomAngT, "vPosMomAngT")
    optimal_cut_chi2_ttbar, optimal_cut_chi2_qcd = compute_roc(signal_chi2, ttbar_chi2, qcd_chi2, bins_chi2, "chi2")
    optimal_cut_maxDeltaR_ttbar, optimal_cut_maxDeltaR_qcd = compute_roc(signal_maxDeltaR, ttbar_maxDeltaR, qcd_maxDeltaR, bins_maxDeltaR, "maxDeltaR")
    optimal_cut_Lxy_ttbar, optimal_cut_Lxy_qcd = compute_roc(signal_Lxy, ttbar_Lxy, qcd_Lxy, bins_Lxy, "Lxy")
    
    print(f"Optimal cut for vPosMomAngT (ttbar): {optimal_cut_vPosMomAngT_ttbar}")
    print(f"Optimal cut for vPosMomAngT (qcd): {optimal_cut_vPosMomAngT_qcd}")
    print(f"Optimal cut for chi2 (ttbar): {optimal_cut_chi2_ttbar}")
    print(f"Optimal cut for chi2 (qcd): {optimal_cut_chi2_qcd}")
    print(f"Optimal cut for maxDeltaR (ttbar): {optimal_cut_maxDeltaR_ttbar}")
    print(f"Optimal cut for maxDeltaR (qcd): {optimal_cut_maxDeltaR_qcd}")
    print(f"Optimal cut for Lxy (ttbar): {optimal_cut_Lxy_ttbar}")
    print(f"Optimal cut for Lxy (qcd): {optimal_cut_Lxy_qcd}")
    
    #from here, you can notice that chi2 has a very low AUC (an almost diagonal line), so it's not a very good discriminator
    #while vPosMomAngT has a very high AUC (it's L-shaped), so it's a very powerful discriminator
    #here the optimization weights the signal efficiency and background rejection equally -- you can also weight these to prefer one or the other!
    #particularly you'll notice that the "optimal" cut for Lxy found this way is much much higher than the cut we found earlier -- we generally want to prioritize signal efficiency some over background rejection
    
    #try playing around with everything here, there's no "right" answer, we need to find the best selections and it's good to try lots of things 
    #once you have a set of selections that you think might be good, apply them successively to the signal and background samples and see how many events you keep
    #and see if the remaining populations of background events can be cut down even more with some other quantity etc etc
    
    #good luck!