Compare revisions

Showing 4690 additions and 53 deletions.
{
    "best_resolution": {
        "cat_filter": [
            ["sigma_m_over_m_smeared_decorr", "<", 0.0105],
            ["lead_mvaID", ">", 0.25],
            ["sublead_mvaID", ">", 0.25]
        ]
    },
    "medium_resolution": {
        "cat_filter": [
            ["sigma_m_over_m_smeared_decorr", ">", 0.0105],
            ["sigma_m_over_m_smeared_decorr", "<", 0.0130],
            ["lead_mvaID", ">", 0.25],
            ["sublead_mvaID", ">", 0.25]
        ]
    },
    "worst_resolution": {
        "cat_filter": [
            ["sigma_m_over_m_smeared_decorr", ">", 0.0130],
            ["lead_mvaID", ">", 0.25],
            ["sublead_mvaID", ">", 0.25]
        ]
    }
}
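Each cat_filter entry above is a [variable, operator, value] triple, and the entries of a flat list are combined with a logical AND (the generator script later in this diff also emits an outer OR-of-AND form for absolute-value variables). As a minimal sketch of how such a flat filter could be applied to a merged parquet file — the helper and file names here are illustrative assumptions, not HiggsDNA's actual selection code:

    import json
    import operator
    import awkward as ak

    OPS = {"<": operator.lt, "<=": operator.le, ">": operator.gt,
           ">=": operator.ge, "==": operator.eq}

    def apply_cat_filter(events, cat_filter):
        # Build a boolean mask that is True where every cut passes (logical AND).
        mask = None
        for var, op, value in cat_filter:
            cut = OPS[op](events[var], value)
            mask = cut if mask is None else mask & cut
        return mask

    # Hypothetical usage: file and key names are placeholders.
    events = ak.from_parquet("merged.parquet")
    cats = json.load(open("resolution_categories.json"))
    best = events[apply_cat_filter(events, cats["best_resolution"]["cat_filter"])]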
{
    "NOMINAL": "nominal",
    "ScaleEBUp": "Et_dependent_ScaleEB_up",
    "ScaleEBDown": "Et_dependent_ScaleEB_down",
    "ScaleEEUp": "Et_dependent_ScaleEE_up",
    "ScaleEEDown": "Et_dependent_ScaleEE_down",
    "SmearingUp": "Et_dependent_Smearing_up",
    "SmearingDown": "Et_dependent_Smearing_down",
    "MaterialUp": "Material_up",
    "MaterialDown": "Material_down",
    "FNUFUp": "FNUF_up",
    "FNUFDown": "FNUF_down",
    "energyErrShiftUp": "energyErrShift_up",
    "energyErrShiftDown": "energyErrShift_down"
}
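This file maps the systematic labels used downstream to the names of the corresponding variations in the ntuples. How it is consumed is not shown in this diff; as a minimal sketch under the assumption of one merged parquet directory per variation (file and directory names are illustrative):

    import json

    with open("syst_map.json") as f:
        syst_map = json.load(f)

    for label, variation in syst_map.items():
        # e.g. "ScaleEBUp" -> "Et_dependent_ScaleEB_up"
        path = f"merged/{variation}/NOTAG_merged.parquet"
        print(f"{label}: would read {path}")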
{
    "EBEB_highR9highR9": {
        "cat_filter": [
            ["lead_isScEtaEB", "==", true],
            ["lead_r9", ">=", 0.85],
            ["sublead_isScEtaEB", "==", true],
            ["sublead_r9", ">=", 0.85],
            ["lead_mvaID", ">", 0.25],
            ["sublead_mvaID", ">", 0.25]
        ]
    },
    "EBEB_highR9lowR9": {
        "cat_filter": [
            ["lead_isScEtaEB", "==", true],
            ["lead_r9", ">=", 0.85],
            ["sublead_isScEtaEB", "==", true],
            ["sublead_r9", "<", 0.85],
            ["sublead_r9", ">=", 0.5],
            ["lead_mvaID", ">", 0.25],
            ["sublead_mvaID", ">", 0.25]
        ]
    },
    "EBEB_lowR9highR9": {
        "cat_filter": [
            ["lead_isScEtaEB", "==", true],
            ["lead_r9", "<", 0.85],
            ["lead_r9", ">=", 0.5],
            ["sublead_isScEtaEB", "==", true],
            ["sublead_r9", ">=", 0.85],
            ["lead_mvaID", ">", 0.25],
            ["sublead_mvaID", ">", 0.25]
        ]
    },
    "EBEE_highR9highR9": {
        "cat_filter": [
            ["lead_isScEtaEB", "==", true],
            ["lead_r9", ">=", 0.85],
            ["sublead_isScEtaEE", "==", true],
            ["sublead_r9", ">=", 0.9],
            ["lead_mvaID", ">", 0.25],
            ["sublead_mvaID", ">", 0.25]
        ]
    },
    "EBEE_highR9lowR9": {
        "cat_filter": [
            ["lead_isScEtaEB", "==", true],
            ["lead_r9", ">=", 0.85],
            ["sublead_isScEtaEE", "==", true],
            ["sublead_r9", "<", 0.9],
            ["sublead_r9", ">=", 0.8],
            ["lead_mvaID", ">", 0.25],
            ["sublead_mvaID", ">", 0.25]
        ]
    },
    "EBEE_lowR9highR9": {
        "cat_filter": [
            ["lead_isScEtaEB", "==", true],
            ["lead_r9", "<", 0.85],
            ["lead_r9", ">=", 0.5],
            ["sublead_isScEtaEE", "==", true],
            ["sublead_r9", ">=", 0.9],
            ["lead_mvaID", ">", 0.25],
            ["sublead_mvaID", ">", 0.25]
        ]
    },
    "EEEB_highR9highR9": {
        "cat_filter": [
            ["lead_isScEtaEE", "==", true],
            ["lead_r9", ">=", 0.9],
            ["sublead_isScEtaEB", "==", true],
            ["sublead_r9", ">=", 0.85],
            ["lead_mvaID", ">", 0.25],
            ["sublead_mvaID", ">", 0.25]
        ]
    },
    "EEEB_highR9lowR9": {
        "cat_filter": [
            ["lead_isScEtaEE", "==", true],
            ["lead_r9", ">=", 0.9],
            ["sublead_isScEtaEB", "==", true],
            ["sublead_r9", "<", 0.85],
            ["sublead_r9", ">=", 0.5],
            ["lead_mvaID", ">", 0.25],
            ["sublead_mvaID", ">", 0.25]
        ]
    },
    "EEEB_lowR9highR9": {
        "cat_filter": [
            ["lead_isScEtaEE", "==", true],
            ["lead_r9", "<", 0.9],
            ["lead_r9", ">=", 0.8],
            ["sublead_isScEtaEB", "==", true],
            ["sublead_r9", ">=", 0.85],
            ["lead_mvaID", ">", 0.25],
            ["sublead_mvaID", ">", 0.25]
        ]
    },
    "EEEE_incl": {
        "cat_filter": [
            ["lead_isScEtaEE", "==", true],
            ["sublead_isScEtaEE", "==", true],
            ["lead_mvaID", ">", 0.25],
            ["sublead_mvaID", ">", 0.25]
        ]
    }
}
{
    "NOMINAL": "nominal",
    "ScaleUp": "Scale_up",
    "ScaleDown": "Scale_down",
    "SmearingUp": "Smearing_up",
    "SmearingDown": "Smearing_down"
}
import argparse
import json


def generate_categories(recoVar, binName, boundaries, isData=False):
    lead_mvaId_string = "lead_mvaID"
    sublead_mvaId_string = "sublead_mvaID"
    sigma_m_over_m_string = "sigma_m_over_m_corr_smeared_decorr"
    if isData:
        sigma_m_over_m_string = "sigma_m_over_m_smeared_decorr"
    categories = {}
    mass_resolution_categories = ['cat0', 'cat1', 'cat2']
    mass_resolution_thresholds = [0.010, 0.014]
    absolute_value_vars = ["rapidity", "first_jet_y"]
    for i in range(len(boundaries) - 1):
        for j, mass_resolution_cat in enumerate(mass_resolution_categories):
            if binName == "":
                binName_str = recoVar
            else:
                binName_str = binName
            category_name = f"RECO_{binName_str}_{str(boundaries[i]).replace('.', 'p')}_{str(boundaries[i + 1]).replace('.', 'p')}_{mass_resolution_cat}"
            if recoVar in absolute_value_vars:
                # Parquet-style handling of logical AND and OR:
                # the outer list is always OR, while the inner lists are AND conditions.
                if boundaries[i] != 0:
                    category_filters = [
                        [
                            [lead_mvaId_string, ">", 0.25],
                            [sublead_mvaId_string, ">", 0.25],
                            [recoVar, ">=", boundaries[i]],
                            [recoVar, "<", boundaries[i + 1]]
                        ],
                        [
                            [lead_mvaId_string, ">", 0.25],
                            [sublead_mvaId_string, ">", 0.25],
                            [recoVar, "<=", -1 * boundaries[i]],
                            [recoVar, ">", -1 * boundaries[i + 1]]
                        ]
                    ]
                else:
                    category_filters = [
                        [
                            [lead_mvaId_string, ">", 0.25],
                            [sublead_mvaId_string, ">", 0.25],
                            [recoVar, ">=", boundaries[i]],
                            [recoVar, "<", boundaries[i + 1]]
                        ],
                        [
                            [lead_mvaId_string, ">", 0.25],
                            [sublead_mvaId_string, ">", 0.25],
                            [recoVar, "<", boundaries[i]],
                            [recoVar, ">=", -1 * boundaries[i + 1]]
                        ]
                    ]
                for k, _ in enumerate(category_filters):
                    if j == 0:
                        category_filters[k].append([sigma_m_over_m_string, "<", mass_resolution_thresholds[j]])
                    elif j == len(mass_resolution_categories) - 1:
                        category_filters[k].append([sigma_m_over_m_string, ">=", mass_resolution_thresholds[j - 1]])
                    else:
                        category_filters[k].append([sigma_m_over_m_string, ">=", mass_resolution_thresholds[j - 1]])
                        category_filters[k].append([sigma_m_over_m_string, "<", mass_resolution_thresholds[j]])
            else:
                category_filters = [
                    [lead_mvaId_string, ">", 0.25],
                    [sublead_mvaId_string, ">", 0.25],
                    [recoVar, ">=", boundaries[i]],
                    [recoVar, "<", boundaries[i + 1]]
                ]
                if j == 0:
                    category_filters.append([sigma_m_over_m_string, "<", mass_resolution_thresholds[j]])
                elif j == len(mass_resolution_categories) - 1:
                    category_filters.append([sigma_m_over_m_string, ">=", mass_resolution_thresholds[j - 1]])
                else:
                    category_filters.append([sigma_m_over_m_string, ">=", mass_resolution_thresholds[j - 1]])
                    category_filters.append([sigma_m_over_m_string, "<", mass_resolution_thresholds[j]])
            categories[category_name] = {
                "cat_filter": category_filters
            }
    return categories


def save_to_json(output_file, categories):
    with open(output_file, 'w') as outfile:
        json.dump(categories, outfile, indent=4)


def main():
    parser = argparse.ArgumentParser(description='Generate a JSON file of categories for the given reco variable and bin boundaries.')
    parser.add_argument('output_file', help='Output file name and location')
    parser.add_argument('recoVar', type=str, default="pt", help='Reco variable')
    parser.add_argument('boundaries', nargs='+', type=float, help='List of bin boundaries')
    parser.add_argument('--isData', action="store_true", default=False, help="Add this flag when running over data; it changes the name of the sigma_m/m variable that is read.")
    parser.add_argument('--binName', dest='binName', type=str, required=False, default="")
    args = parser.parse_args()
    print(args.isData)
    categories = generate_categories(args.recoVar, args.binName, args.boundaries, args.isData)
    save_to_json(args.output_file, categories)


if __name__ == "__main__":
    main()
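As a usage sketch (the script's file name is not visible in this diff, so generate_categories.py is a placeholder), the following would produce 8 × 3 = 24 RECO categories for an 8-bin pT spectrum:

    python generate_categories.py cats_pt.json pt 0 15 30 45 80 120 200 350 10000 --binName PTH

Each kinematic bin is split into cat0/cat1/cat2 by the hard-coded sigma_m/m thresholds of 0.010 and 0.014; passing --isData switches the sigma_m/m column from sigma_m_over_m_corr_smeared_decorr to sigma_m_over_m_smeared_decorr.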
{
    "GenPTH": {
        "(0, 15, 'in', '')": 10,
        "(15, 30, 'in', '')": 11,
        "(30, 45, 'in', '')": 12,
        "(45, 80, 'in', '')": 13,
        "(80, 120, 'in', '')": 14,
        "(120, 200, 'in', '')": 15,
        "(200, 350, 'in', '')": 16,
        "(350, 10000, 'in', '')": 17,
        "(0, 10000, 'out', '')": 18
    }
}
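The keys of this GenPTH map appear to be stringified (low, high, 'in'/'out', suffix) tuples mapped to bin indices; how the 'in'/'out' flag is consumed is not shown in this diff. A minimal sketch for resolving the bin index of a generated Higgs pT under that assumption (file name illustrative):

    import ast
    import json

    with open("genpth_bins.json") as f:
        bin_map = json.load(f)["GenPTH"]

    def genpth_bin_index(pt):
        # Parse each stringified tuple key and test the 'in'-acceptance bins.
        for key, index in bin_map.items():
            low, high, in_out, _ = ast.literal_eval(key)
            if in_out == 'in' and low <= pt < high:
                return index
        # Fall through to the out-of-acceptance bin (index 18 above).
        return next(i for k, i in bin_map.items() if ast.literal_eval(k)[2] == 'out')

    print(genpth_bin_index(95.0))  # -> 14, the (80, 120, 'in', '') bin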
import glob

import awkward as ak
import pyarrow.parquet as pq


def Get_WeightSum_Btag(source_paths, logger):
    # list of all b-tagging systematic variations
    bTag_sys_variation = ['lfstats1', 'hfstats2', 'jes', 'cferr2', 'lf', 'hf', 'lfstats2', 'hfstats1', 'cferr1']
    sum_weight_central_arr, sum_weight_central_wo_bTagSF_arr = [], []
    sum_weight_bTagSF_sys_arr = []
    flag_bWeight_sys_array = []
    for source_path in source_paths:
        # check whether the systematic variations are stored by accessing one field of the first parquet file
        dataset_check_fields = ak.from_parquet(glob.glob("%s/*.parquet" % source_path)[0])
        flag_bWeight_sys = "weight_bTagSF_sys_jesDown" in dataset_check_fields.fields
        del dataset_check_fields
        if flag_bWeight_sys:
            logger.info(
                f"Attempting to extract the sum of central weights and b-tag systematic weights from the metadata of the files to be merged from {source_path}"
            )
        else:
            logger.info(
                "Skipping the renormalisation of b-tagging systematic weights. Please check whether you have stored the weights for the b-tag systematic variations. Don't worry if you are not evaluating b-tagging systematics for now."
            )
        source_files = glob.glob("%s/*.parquet" % source_path)
        sum_weight_central, sum_weight_central_wo_bTagSF = 0, 0
        sum_weight_bTagSF_sys_dct = {}
        if flag_bWeight_sys:
            # dictionary to store the up and down variations together
            for sys_name in bTag_sys_variation:
                sum_weight_bTagSF_sys_dct["sum_weight_bTagSF_" + sys_name + "Up"] = 0
                sum_weight_bTagSF_sys_dct["sum_weight_bTagSF_" + sys_name + "Down"] = 0
        for f in source_files:
            metadata = pq.read_table(f).schema.metadata
            try:
                # read the sum of the weights without any systematic variation from the metadata
                sum_weight_central += float(metadata[b'sum_weight_central'])
                sum_weight_central_wo_bTagSF += float(metadata[b'sum_weight_central_wo_bTagSF'])
            except (KeyError, TypeError):
                logger.info(
                    "Skipping the renormalisation of weights from b-tagging systematics. Please check whether you have stored the sum of the weights in the metadata with the proper naming, e.g. sum_weight_central, sum_weight_central_wo_bTagSF."
                )
                # set the sums of weights before and after the b-tag SF to 1 so that the ratio is one and merge_parquet.py will not apply any renormalisation
                sum_weight_central, sum_weight_central_wo_bTagSF = 1.0, 1.0
            if flag_bWeight_sys:
                for sys_name in bTag_sys_variation:
                    try:
                        # read the sum of the weights for each systematic variation from the metadata
                        sum_weight_bTagSF_sys_dct["sum_weight_bTagSF_" + sys_name + "Up"] += float(metadata[bytes('sum_weight_bTagSF_sys_' + sys_name + 'Up', encoding='utf8')])
                        sum_weight_bTagSF_sys_dct["sum_weight_bTagSF_" + sys_name + "Down"] += float(metadata[bytes('sum_weight_bTagSF_sys_' + sys_name + 'Down', encoding='utf8')])
                    except (KeyError, TypeError):
                        logger.info(
                            "Skipping the renormalisation of weights from b-tagging systematics. Please check whether you have stored the sum of the weights after applying the b-tag systematic weights in the metadata with the proper naming, e.g. sum_weight_bTagSF_jesUp, sum_weight_bTagSF_jesDown."
                        )
                        flag_bWeight_sys = False
                        break
        sum_weight_central_arr.append(sum_weight_central)
        sum_weight_central_wo_bTagSF_arr.append(sum_weight_central_wo_bTagSF)
        flag_bWeight_sys_array.append(flag_bWeight_sys)
        sum_weight_bTagSF_sys_arr.append(sum_weight_bTagSF_sys_dct)
        logger.info(
            "Successfully extracted the sum of weights with and without b-tag SFs."
        )
        if flag_bWeight_sys:
            logger.info(
                "Successfully extracted the sum of systematic weights with and without b-tag SFs."
            )
    IsBtagNorm_sys_arr, WeightSum_preBTag_arr, WeightSum_postBTag_arr, dir_WeightSum_postBTag_sys_arr = flag_bWeight_sys_array, sum_weight_central_wo_bTagSF_arr, sum_weight_central_arr, sum_weight_bTagSF_sys_arr
    return IsBtagNorm_sys_arr, WeightSum_preBTag_arr, WeightSum_postBTag_arr, dir_WeightSum_postBTag_sys_arr


def Renormalize_BTag_Weights(dataset, target_path, cat, WeightSum_preBTag, WeightSum_postBTag, WeightSum_postBTag_sys, IsBtagNorm_sys, logger):
    bTag_sys_variation = ['lfstats1', 'hfstats2', 'jes', 'cferr2', 'lf', 'hf', 'lfstats2', 'hfstats1', 'cferr1']
    logger.info(
        f"Attempting to renormalise the weights with respect to no b-tag SF from {target_path}{cat}_merged.parquet"
    )
    # modify the existing weight column
    if WeightSum_preBTag != 0 and WeightSum_postBTag != 0:
        dataset['weight'] = dataset['weight'] * (WeightSum_preBTag / WeightSum_postBTag)
        logger.info(
            f"Successfully renormalised weights with respect to no b-tag SF from {target_path}{cat}_merged.parquet"
        )
    else:
        logger.info(
            f"Skipping weight renormalisation with respect to no b-tag SF from {target_path}{cat}_merged.parquet"
        )
    if IsBtagNorm_sys:
        for sys_name in bTag_sys_variation:
            dataset['weight_bTagSF_sys_' + sys_name + 'Up'] = dataset['weight_bTagSF_sys_' + sys_name + 'Up'] * (WeightSum_preBTag / WeightSum_postBTag_sys["sum_weight_bTagSF_" + sys_name + "Up"])
            dataset['weight_bTagSF_sys_' + sys_name + 'Down'] = dataset['weight_bTagSF_sys_' + sys_name + 'Down'] * (WeightSum_preBTag / WeightSum_postBTag_sys["sum_weight_bTagSF_" + sys_name + "Down"])
        logger.info(
            f"Successfully renormalised systematic weights with respect to no b-tag SF from {target_path}{cat}_merged.parquet"
        )
    else:
        logger.info(
            f"Skipping systematic weight renormalisation with respect to no b-tag SF from {target_path}{cat}_merged.parquet"
        )
    return dataset
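A minimal sketch of how these two helpers could be chained by a merging step, assuming they are in scope — the paths, category name, and logger setup are illustrative, not taken from this diff:

    import logging

    import awkward as ak

    logging.basicConfig(level=logging.INFO)
    logger = logging.getLogger(__name__)

    # Hypothetical input: one source directory of unmerged parquet files.
    source_paths = ["out/ttH_M125/nominal"]
    flags, pre, post, post_sys = Get_WeightSum_Btag(source_paths, logger)

    dataset = ak.from_parquet("out/ttH_M125/nominal/0.parquet")
    dataset = Renormalize_BTag_Weights(
        dataset, "merged/ttH_M125/", "NOTAG",
        pre[0], post[0], post_sys[0], flags[0], logger,
    )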