From 256575e7f32f8e09e791f6702d53be6327cde0e8 Mon Sep 17 00:00:00 2001 From: Devin Date: Wed, 17 Jun 2026 20:46:08 +0200 Subject: [PATCH 1/2] Update dc make to handle meta eras --- dc_make/maker.py | 1316 ++++++++++++++++++++++++++-------------- dc_make/uncertainty.py | 638 ++++++++++--------- 2 files changed, 1235 insertions(+), 719 deletions(-) diff --git a/dc_make/maker.py b/dc_make/maker.py index a439c26..ae1fa61 100644 --- a/dc_make/maker.py +++ b/dc_make/maker.py @@ -5,462 +5,898 @@ from CombineHarvester.CombineTools.ch import CombineHarvester -from StatInference.common.tools import listToVector, rebinAndFill, importROOT, resolveNegativeBins, getRelevantBins +from StatInference.common.tools import ( + listToVector, + rebinAndFill, + importROOT, + resolveNegativeBins, + getRelevantBins, +) from .process import Process -from .uncertainty import Uncertainty, UncertaintyType, UncertaintyScale, MultiValueLnNUncertainty +from .uncertainty import ( + Uncertainty, + UncertaintyType, + UncertaintyScale, + MultiValueLnNUncertainty, + ShapeUncertainty, +) from .model import Model from .binner import Binner + ROOT = importROOT() class DatacardMaker: - customizeble_parameters = ["eras", "channels", "categories"] - - def __init__(self, cfg_file, input_path, hist_bins=None, param_values=None, **kwargs): - self.cb = CombineHarvester() - - self.input_path = input_path - with open(cfg_file, 'r') as f: - cfg = yaml.safe_load(f) - for param in self.customizeble_parameters: - if param not in kwargs: continue - value = kwargs[param] - if value is None: continue - value_type = type(cfg[param]) - if value_type == list: - value = value.split(',') - else: - value = value_type(value) - print(f"Overriding config parameter {param} to {value}") - cfg[param] = value - - self.analysis = cfg["analysis"] - self.eras = cfg["eras"] - self.channels = cfg["channels"] - self.categories = cfg["categories"] - self.signalFractionForRelevantBins = cfg['signalFractionForRelevantBins'] - - - self.bins = [] - for era, channel, cat in self.ECC(): - bin = self.getBin(era, channel, cat, return_index=False) - self.bins.append(bin) - - self.model = Model.fromConfig(cfg["model"]) - self.keep_all_signal_hypothesis_into_single_datacard = cfg.get("keep_all_signal_hypothesis_into_single_datacard", False) - self.param_bins = {} - self.processes = {} - self.param_of = {} - self.base_of = {} - data_process = None - has_signal = False - self.channel_processes = {} - for channel in self.channels: - self.channel_processes[channel] = [] - - for process in cfg["processes"]: - if (type(process) != str) and process.get('is_signal', False): - if param_values is not None: - print(f"Overwriting signal parameters to {param_values}") - process['param_values'] = param_values - new_processes = Process.fromConfig(process, self.model) - for process in new_processes: - if process.name in self.processes: - raise RuntimeError(f"Process name {process.name} already exists") - print(f"Adding {process}") - self.processes[process.name] = process - if process.channels: - for channel in process.channels: - if channel not in self.channel_processes: - print(f"Channel {channel} not defined in config") - continue - self.channel_processes[channel].append(process.name) - else: - for channel in self.channels: - self.channel_processes[channel].append(process.name) - if process.is_data: - if data_process is not None: - raise RuntimeError("Multiple data processes defined") - data_process = process - if process.is_signal: - has_signal = True - self.param_bins.setdefault(self.model.paramStr(process.params), process.params) - if data_process is None: - raise RuntimeError("No data process defined") - if not has_signal: - raise RuntimeError("No signal process defined") - - self.uncertainties = {} - for unc_entry in cfg["uncertainties"]: - unc = Uncertainty.fromConfig(unc_entry) - if unc.name in self.uncertainties: - raise RuntimeError(f"Uncertainty {unc.name} already exists") - self.uncertainties[unc.name] = unc - - self.autolnNThr = cfg.get("autolnNThr", 0.05) - self.asymlnNThr = cfg.get("asymlnNThr", 0.001) - self.ignorelnNThr = cfg.get("ignorelnNThr", 0.001) - - self.autoMCStats = cfg.get("autoMCStats", { 'apply': False }) - - - hist_bins = hist_bins or cfg.get("hist_bins", None) - self.hist_binner = Binner(hist_bins) - # print(f"Using hist_bins: {self.hist_binner.hist_bins}") - - self.input_files = {} - self.shapes = {} - self.signal_hists_by_key = {} - - - def getBin(self, era, channel, category, return_name=True, return_index=True): - name = f'{era}_{self.analysis}_{channel}_{category}' - if not return_name and not return_index: - raise RuntimeError("Invalid argument combination") - if not return_index: - return name - index = self.bins.index(name) - if not return_name: - return index - return (index, name) - - def cbCopy(self, param_str, process, era, channel, category): - bin_idx, bin_name = self.getBin(era, channel, category) - return self.cb.cp().mass([param_str]).process([process]).bin([bin_name]) - - def ECC(self): - return itertools.product(self.eras, self.channels, self.categories) - - def PPECC(self): - param_bins = list(self.param_bins.keys()) - if not self.model.param_dependent_bkg: - param_bins.append("*") - return itertools.product(self.processes.keys(), param_bins, self.eras, self.channels, self.categories) - - def getInputFile(self, era, model_params): - file_name = self.model.getInputFileName(era, model_params) - if file_name not in self.input_files: - full_file_name = os.path.join(self.input_path, file_name) - file = ROOT.TFile.Open(full_file_name, "READ") - if file == None: - raise RuntimeError(f"Cannot open file {full_file_name}") - self.input_files[file_name] = file - return file_name, self.input_files[file_name] - - - def getMultiValueLnUnc(self,unc,unc_name, process, era, channel, category, model_params):#, unc_name=None, unc_scale=None) - file_name, file = self.getInputFile(era, model_params) - hist_name = f"{channel}/{category}/{process.hist_name}" - if unc.getUncertaintyForProcess(process.name) != None: - return unc.getUncertaintyForProcess(process.name) - elif process.subprocesses: - unc_value_tot_down = 0. - unc_value_tot_up = 0. - yield_value_tot = 0. - for subp in process.subprocesses: - hist_name = f"{channel}/{category}/{subp}" - subhist = file.Get(hist_name) - #newhist = self.hist_binner.applyBinning(era, channel, category, model_params, subhist) - if subhist == None: - raise RuntimeError(f"Cannot find histogram {hist_name} in {file.GetName()}") - axis = subhist.GetXaxis() - yield_subproc = subhist.Integral(1,axis.GetNbins() + 1) - unc_value = unc.getUncertaintyForProcess(subp) - if unc_value != None: - if yield_subproc == 0 : continue - # print(unc_value) - if isinstance(unc_value, dict): - unc_value_tot_up += unc_value[UncertaintyScale.Up]*yield_subproc - unc_value_tot_down += unc_value[UncertaintyScale.Down]*yield_subproc - else: - unc_value_tot_up += unc_value*yield_subproc - unc_value_tot_down -= unc_value*yield_subproc - yield_value_tot+=yield_subproc - if unc_value_tot_up != 0. and unc_value_tot_down !=0 : - return {UncertaintyScale.Down: unc_value_tot_down/yield_value_tot, UncertaintyScale.Up: unc_value_tot_up/yield_value_tot} - return None - return None - - - def getShape(self, process, era, channel, category, model_params, unc_name=None, unc_scale=None): - file_name, file = self.getInputFile(era, model_params) - key = (file_name, process.name, era, channel, category, unc_name, unc_scale) - if key not in self.shapes: - if process.is_data and (unc_name is not None or unc_scale is not None): - raise RuntimeError("Cannot apply uncertainty to the data process") - if process.is_asimov_data: - hist = None - for bkg_proc in self.processes.values(): - if bkg_proc.is_background: - if bkg_proc.name not in self.channel_processes[channel]: - continue - bkg_hist = self.getShape(bkg_proc, era, channel, category, model_params) - if hist is None: - hist = bkg_hist.Clone() + customizeble_parameters = ["eras", "channels", "categories"] + + def __init__( + self, cfg_file, input_path, hist_bins=None, param_values=None, **kwargs + ): + self.cb = CombineHarvester() + + self.input_path = input_path + with open(cfg_file, "r") as f: + cfg = yaml.safe_load(f) + for param in self.customizeble_parameters: + if param not in kwargs: + continue + value = kwargs[param] + if value is None: + continue + value_type = type(cfg[param]) + if value_type == list: + value = value.split(",") else: - hist.Add(bkg_hist) - if hist is None: - raise RuntimeError("Cannot create asimov data histogram") - else: - hist_name = f"{channel}/{category}/{process.hist_name}" - hists = [] - if process.subprocesses: - for subp in process.subprocesses: - hist_name = f"{channel}/{category}/{subp}" - if unc_name and unc_scale: - hist_name += f"_{unc_name}_{unc_scale}" - subhist = file.Get(hist_name) - if subhist == None: - raise RuntimeError(f"Cannot find histogram {hist_name} in {file.GetName()}") - hists.append(self.hist_binner.applyBinning(era, channel, category, model_params, subhist)) + value = value_type(value) + print(f"Overriding config parameter {param} to {value}") + cfg[param] = value + + self.analysis = cfg["analysis"] + self.eras = cfg["eras"] + self.channels = cfg["channels"] + self.categories = cfg["categories"] + self.signalFractionForRelevantBins = cfg["signalFractionForRelevantBins"] + + # Load era groups if defined (e.g., Early_Run3 combining multiple eras) + self.era_groups = cfg.get("era_groups", {}) + # Create reverse mapping: sub_era -> meta_era + self.era_to_group = {} + for group_name, sub_eras in self.era_groups.items(): + for sub_era in sub_eras: + if sub_era not in self.era_to_group: + self.era_to_group[sub_era] = [] + self.era_to_group[sub_era].append(group_name) + + # Expand eras to include both regular eras and meta-eras + all_eras = list(self.eras) + for group_name in self.era_groups.keys(): + if group_name not in all_eras: + all_eras.append(group_name) + self.all_eras = all_eras + + self.bins = [] + for era, channel, cat in self.ECC(): + bin = self.getBin(era, channel, cat, return_index=False) + self.bins.append(bin) + + self.model = Model.fromConfig(cfg["model"]) + self.keep_all_signal_hypothesis_into_single_datacard = cfg.get( + "keep_all_signal_hypothesis_into_single_datacard", False + ) + self.param_bins = {} + self.processes = {} + self.param_of = {} + self.base_of = {} + data_process = None + has_signal = False + self.channel_processes = {} + for channel in self.channels: + self.channel_processes[channel] = [] + + for process in cfg["processes"]: + if (type(process) != str) and process.get("is_signal", False): + if param_values is not None: + print(f"Overwriting signal parameters to {param_values}") + process["param_values"] = param_values + new_processes = Process.fromConfig(process, self.model) + for process in new_processes: + if process.name in self.processes: + raise RuntimeError(f"Process name {process.name} already exists") + print(f"Adding {process}") + self.processes[process.name] = process + if process.channels: + for channel in process.channels: + if channel not in self.channel_processes: + print(f"Channel {channel} not defined in config") + continue + self.channel_processes[channel].append(process.name) + else: + for channel in self.channels: + self.channel_processes[channel].append(process.name) + if process.is_data: + if data_process is not None: + raise RuntimeError("Multiple data processes defined") + data_process = process + if process.is_signal: + has_signal = True + self.param_bins.setdefault( + self.model.paramStr(process.params), process.params + ) + if data_process is None: + raise RuntimeError("No data process defined") + if not has_signal: + raise RuntimeError("No signal process defined") + + self.uncertainties = {} + for unc_entry in cfg["uncertainties"]: + unc = Uncertainty.fromConfig(unc_entry) + if unc.name in self.uncertainties: + raise RuntimeError(f"Uncertainty {unc.name} already exists") + self.uncertainties[unc.name] = unc + + self.autolnNThr = cfg.get("autolnNThr", 0.05) + self.asymlnNThr = cfg.get("asymlnNThr", 0.001) + self.ignorelnNThr = cfg.get("ignorelnNThr", 0.001) + + self.autoMCStats = cfg.get("autoMCStats", {"apply": False}) + + hist_bins = hist_bins or cfg.get("hist_bins", None) + self.hist_binner = Binner(hist_bins) + # print(f"Using hist_bins: {self.hist_binner.hist_bins}") + + self.input_files = {} + self.shapes = {} + self.signal_hists_by_key = {} + + def getBin(self, era, channel, category, return_name=True, return_index=True): + name = f"{era}_{self.analysis}_{channel}_{category}" + if not return_name and not return_index: + raise RuntimeError("Invalid argument combination") + if not return_index: + return name + index = self.bins.index(name) + if not return_name: + return index + return (index, name) + + def getSubEras(self, era): + """Get sub-eras for a given era. If era is a meta-era, return its sub-eras. + Otherwise return [era].""" + if era in self.era_groups: + return self.era_groups[era] + return [era] + + def isMetaEra(self, era): + """Check if era is a meta-era.""" + return era in self.era_groups + + def cbCopy(self, param_str, process, era, channel, category): + bin_idx, bin_name = self.getBin(era, channel, category) + return self.cb.cp().mass([param_str]).process([process]).bin([bin_name]) + + def ECC(self): + return itertools.product(self.all_eras, self.channels, self.categories) + + def PPECC(self): + param_bins = list(self.param_bins.keys()) + if not self.model.param_dependent_bkg: + param_bins.append("*") + return itertools.product( + self.processes.keys(), + param_bins, + self.all_eras, + self.channels, + self.categories, + ) + + def getInputFile(self, era, model_params): + file_name = self.model.getInputFileName(era, model_params) + if file_name not in self.input_files: + full_file_name = os.path.join(self.input_path, file_name) + file = ROOT.TFile.Open(full_file_name, "READ") + if file == None: + raise RuntimeError(f"Cannot open file {full_file_name}") + self.input_files[file_name] = file + return file_name, self.input_files[file_name] + + def _getLnNValue(self, unc, process, proc_name_for_unc, sub_era, channel, category): + if isinstance(unc, MultiValueLnNUncertainty): + return unc.getUncertaintyForProcess( + proc_name_for_unc, sub_era, channel, category + ) + if unc.appliesTo(process, sub_era, channel, category): + return unc.value + return None + + def _applyLnNToHist(self, hist, unc_value, direction): + scaled = hist.Clone() + if isinstance(unc_value, dict): + factor = 1 + unc_value[direction] + elif direction == UncertaintyScale.Up: + factor = 1 + unc_value else: - if unc_name and unc_scale: - hist_name += f"_{unc_name}_{unc_scale}" - hist = file.Get(hist_name) - if hist == None: + factor = 1 - unc_value + scaled.Scale(factor) + scaled.SetDirectory(0) + return scaled + + def _loadBinnedHist(self, file, era, channel, category, model_params, hist_name): + hist = file.Get(hist_name) + if hist is None: raise RuntimeError(f"Cannot find histogram {hist_name} in {file.GetName()}") - hists.append(self.hist_binner.applyBinning(era, channel, category, model_params, hist)) - if len(hists) == 0: - raise RuntimeError(f"hist list is empty for file {file.GetName()}") - hist = hists[0] - if len(hists)>1: - for histy in hists[1:]: - hist.Add(histy) - - hist.SetDirectory(0) - if process.scale != 1: - hist.Scale(process.scale) - - if process.is_signal and not (unc_name and unc_scale): - param_str = self.model.paramStr(model_params) if model_params else '*' - key_sig = (era, channel, category, param_str if not self.keep_all_signal_hypothesis_into_single_datacard else '*') - self.signal_hists_by_key.setdefault(key_sig, []).append(hist) - else: - param_str = self.model.paramStr(model_params) if model_params else '*' - key_sig = (era, channel, category, param_str if not self.keep_all_signal_hypothesis_into_single_datacard else '*') - signals = self.signal_hists_by_key.get(key_sig, []) - relevant_bins = getRelevantBins(era, channel, category, signals, self.signalFractionForRelevantBins) - solution = resolveNegativeBins(hist,relevant_bins=relevant_bins,allow_zero_integral=process.allow_zero_integral,allow_negative_bins_within_error=process.allow_negative_bins_within_error,max_n_sigma_for_negative_bins=process.max_n_sigma_for_negative_bins,allow_negative_integral=process.allow_negative_integral) - - if not solution.accepted: - axis = hist.GetXaxis() - bins_edges = [ str(axis.GetBinLowEdge(n)) for n in range(1, axis.GetNbins() + 2)] - bin_values = [ str(hist.GetBinContent(n)) for n in range(1, axis.GetNbins() + 1)] - bin_errors = [ str(hist.GetBinError(n)) for n in range(1, axis.GetNbins() + 1)] - print(f'bins_edges: [ {", ".join(bins_edges)} ]') - print(f'bin_values: [ {", ".join(bin_values)} ]') - print(f'bin_errors: [ {", ".join(bin_errors)} ]') - raise RuntimeError( - f"Negative bins found in histogram for {channel}/{category}/{process.hist_name}" - + (f" (syst {unc_name}{unc_scale})" if unc_name and unc_scale else "") + binned = self.hist_binner.applyBinning( + era, channel, category, model_params, hist + ) + binned.SetDirectory(0) + return binned + + def _getSubEraLnNVariedShapes( + self, unc, process, sub_era, channel, category, model_params + ): + if process.subprocesses and isinstance(unc, MultiValueLnNUncertainty): + file_name, file = self.getInputFile(sub_era, model_params) + up_hist = None + down_hist = None + applies = False + + for subp in process.subprocesses: + hist = self._loadBinnedHist( + file, + sub_era, + channel, + category, + model_params, + f"{channel}/{category}/{subp}", + ) + unc_value = self._getLnNValue( + unc, process, subp, sub_era, channel, category + ) + if unc_value is not None: + applies = True + sub_up = self._applyLnNToHist(hist, unc_value, UncertaintyScale.Up) + sub_down = self._applyLnNToHist( + hist, unc_value, UncertaintyScale.Down + ) + else: + sub_up = hist.Clone() + sub_down = hist.Clone() + sub_up.SetDirectory(0) + sub_down.SetDirectory(0) + + if up_hist is None: + up_hist = sub_up + down_hist = sub_down + else: + up_hist.Add(sub_up) + down_hist.Add(sub_down) + + if process.scale != 1: + up_hist.Scale(process.scale) + down_hist.Scale(process.scale) + return up_hist, down_hist, applies + + nominal_sub = self.getShape(process, sub_era, channel, category, model_params) + unc_value = self._getLnNValue( + unc, process, process.name, sub_era, channel, category + ) + if unc_value is None: + up_hist = nominal_sub.Clone() + down_hist = nominal_sub.Clone() + up_hist.SetDirectory(0) + down_hist.SetDirectory(0) + return up_hist, down_hist, False + + up_hist = self._applyLnNToHist(nominal_sub, unc_value, UncertaintyScale.Up) + down_hist = self._applyLnNToHist(nominal_sub, unc_value, UncertaintyScale.Down) + return up_hist, down_hist, True + + def getMetaEraLnNShapeUnc(self, unc, process, era, channel, category, model_params): + if not self.isMetaEra(era): + return None + + nominal_shape = self.getShape(process, era, channel, category, model_params) + combined_up = None + combined_down = None + any_applies = False + + for sub_era in self.getSubEras(era): + up, down, applies = self._getSubEraLnNVariedShapes( + unc, process, sub_era, channel, category, model_params + ) + if applies: + any_applies = True + if combined_up is None: + combined_up = up.Clone() + combined_down = down.Clone() + else: + combined_up.Add(up) + combined_down.Add(down) + + if not any_applies: + return None + return nominal_shape, { + UncertaintyScale.Up: combined_up, + UncertaintyScale.Down: combined_down, + } + + def _canIgnoreLnNShape(self, nominal_shape, shapes): + nom_int = nominal_shape.Integral() + if nom_int == 0: + return True + up_frac = (shapes[UncertaintyScale.Up].Integral() - nom_int) / nom_int + down_frac = (shapes[UncertaintyScale.Down].Integral() - nom_int) / nom_int + return abs(up_frac) < self.ignorelnNThr and abs(down_frac) < self.ignorelnNThr + + def _addMetaEraLnNAsShapeUnc( + self, unc_name, proc, param_str, process, era, channel, category, model_params + ): + unc = self.uncertainties[unc_name] + shape_result = self.getMetaEraLnNShapeUnc( + unc, process, era, channel, category, model_params + ) + if shape_result is None: + return False + nominal_shape, shapes = shape_result + if self._canIgnoreLnNShape(nominal_shape, shapes): + print( + f"Ignoring uncertainty {unc_name} for {proc} in {era} {channel} {category}" ) - self.shapes[key] = hist - return self.shapes[key] - - - - - def addProcess(self, proc, era, channel, category): - bin_idx, bin_name = self.getBin(era, channel, category) - process = self.processes[proc] - def add(model_params, param_str, process_name): - if process.is_data: - self.cb.AddObservations([param_str], [self.analysis], [era], [channel], [(bin_idx, bin_name)]) - else: - self.cb.AddProcesses([param_str], [self.analysis], [era], [channel], [process_name], [(bin_idx, bin_name)], process.is_signal) - - shape = self.getShape(process, era, channel, category, model_params) - shape_set = False - def setShape(p): - nonlocal shape_set - print(f"Setting shape for {p}") - if shape_set: - raise RuntimeError("Shape already set") - p.set_shape(shape, True) - shape_set = True - cb_copy = self.cbCopy(param_str, process_name, era, channel, category) - if process.is_data: - cb_copy.ForEachObs(setShape) - else: - cb_copy.ForEachProc(setShape) - - if process.is_signal: - model_params = process.params - param_str = self.model.paramStr(model_params) - if self.keep_all_signal_hypothesis_into_single_datacard: - actual_proc_name = f"{process.name}_{param_str}" - add(model_params, '*', actual_proc_name) - self.param_of[('*', actual_proc_name)] = model_params - self.base_of[actual_proc_name] = process.name - else: - actual_proc_name = process.name - add(model_params, param_str, actual_proc_name) - self.param_of[(param_str, actual_proc_name)] = model_params - self.base_of[actual_proc_name] = process.name - - - elif self.model.param_dependent_bkg: - for signal_proc in self.processes.values(): - if not signal_proc.is_signal: continue - model_params = signal_proc.params - param_str = self.model.paramStr(model_params) if not self.keep_all_signal_hypothesis_into_single_datacard else '*' - add(model_params, param_str, proc) - self.param_of[(param_str, proc)] = model_params - self.base_of[proc] = proc - else: - add(None, "*", proc) - - def addUncertainty(self, unc_name): - unc = self.uncertainties[unc_name] - isMVLnUnc = isinstance(unc, MultiValueLnNUncertainty) - - for proc, param_str, era, channel, category in self.PPECC(): - if proc not in self.channel_processes[channel]: - continue - process = self.processes[proc] - if process.is_data: continue - model_params = self.param_bins.get(param_str, None) - if isMVLnUnc: - unc_value = self.getMultiValueLnUnc(unc,unc_name,process, era, channel, category, model_params)#, unc_name=None, unc_scale=None - - uncApplies = unc_value != None if isMVLnUnc else unc.appliesTo(process, era, channel, category) - if not uncApplies: continue - if not process.hasCompatibleModelParams(model_params, self.model.param_dependent_bkg): continue - - - nominal_shape = None - shapes = {} - if unc.needShapes: - model_params = self.param_bins.get(param_str, None) - nominal_shape = self.getShape(self.processes[proc], era, channel, category, model_params) - - - for unc_scale in [ UncertaintyScale.Up, UncertaintyScale.Down ]: - shapes[unc_scale] = self.getShape(self.processes[proc], era, channel, category, model_params, - unc_name, unc_scale.name) - - unc_to_apply = unc.resolveType(nominal_shape, shapes, self.autolnNThr, self.asymlnNThr) - can_ignore = unc_to_apply.canIgnore(unc_value, self.ignorelnNThr) if isMVLnUnc else unc_to_apply.canIgnore(self.ignorelnNThr) - if can_ignore: - print(f"Ignoring uncertainty {unc_name} for {proc} in {era} {channel} {category}") - continue - systMap = unc_to_apply.valueToMap(unc_value) if isMVLnUnc else unc_to_apply.valueToMap() - cb_copy = self.cbCopy(param_str, proc, era, channel, category) - cb_copy.AddSyst(self.cb, unc_name, unc_to_apply.type.name, systMap) - if unc_to_apply.type == UncertaintyType.shape: + return False + + cb_copy = self.cbCopy(param_str, proc, era, channel, category) + cb_copy.AddSyst( + self.cb, + unc_name, + UncertaintyType.shape.name, + ShapeUncertainty(unc_name).valueToMap(), + ) shape_set = False + def setShape(syst): - nonlocal shape_set - print(f"Setting unc shape for {syst}") - if shape_set: - raise RuntimeError("Shape already set") - syst.set_shapes(shapes[UncertaintyScale.Up], shapes[UncertaintyScale.Down], nominal_shape) - shape_set = True - self.cbCopy(param_str, proc, era, channel, category).syst_name([unc_name]).ForEachSyst(setShape) - - if self.keep_all_signal_hypothesis_into_single_datacard: - for (param_str, proc_name), params in self.param_of.items(): - for era, channel, category in self.ECC(): - base_name = self.base_of.get(proc_name, proc_name) - process = self.processes[base_name] - if process.is_data: - continue + nonlocal shape_set + print(f"Setting unc shape for {syst}") + if shape_set: + raise RuntimeError("Shape already set") + syst.set_shapes( + shapes[UncertaintyScale.Up], + shapes[UncertaintyScale.Down], + nominal_shape, + ) + shape_set = True + + self.cbCopy(param_str, proc, era, channel, category).syst_name( + [unc_name] + ).ForEachSyst(setShape) + return True + + def getCombinedShape( + self, + process, + era, + channel, + category, + model_params, + unc_name=None, + unc_scale=None, + ): + """Combine histograms from multiple sub-eras for a meta-era. + For meta-eras, this sums histograms from constituent sub-eras. + For regular eras, delegates to getShape.""" + if not self.isMetaEra(era): + # Regular era - just get the shape normally + return self.getShape( + process, era, channel, category, model_params, unc_name, unc_scale + ) + + # Meta-era: combine histograms from all sub-eras + sub_eras = self.getSubEras(era) + combined_hist = None - if isMVLnUnc: - unc_value = self.getMultiValueLnUnc( - unc, unc_name, process, era, channel, category, params + for sub_era in sub_eras: + sub_hist = self.getShape( + process, sub_era, channel, category, model_params, unc_name, unc_scale + ) + if sub_hist is None: + continue + if combined_hist is None: + combined_hist = sub_hist.Clone() + else: + combined_hist.Add(sub_hist) + + return combined_hist + + def getShape( + self, + process, + era, + channel, + category, + model_params, + unc_name=None, + unc_scale=None, + ): + # Handle meta-eras by combining sub-era shapes + if self.isMetaEra(era): + return self.getCombinedShape( + process, era, channel, category, model_params, unc_name, unc_scale + ) + + file_name, file = self.getInputFile(era, model_params) + key = (file_name, process.name, era, channel, category, unc_name, unc_scale) + + if key not in self.shapes: + if process.is_data and (unc_name is not None or unc_scale is not None): + raise RuntimeError("Cannot apply uncertainty to the data process") + if process.is_asimov_data: + hist = None + for bkg_proc in self.processes.values(): + if bkg_proc.is_background: + if bkg_proc.name not in self.channel_processes[channel]: + continue + bkg_hist = self.getShape( + bkg_proc, era, channel, category, model_params + ) + if hist is None: + hist = bkg_hist.Clone() + else: + hist.Add(bkg_hist) + if hist is None: + raise RuntimeError("Cannot create asimov data histogram") + else: + hist_name = f"{channel}/{category}/{process.hist_name}" + hists = [] + if process.subprocesses: + for subp in process.subprocesses: + hist_name = f"{channel}/{category}/{subp}" + if unc_name and unc_scale: + hist_name += f"_{unc_name}_{unc_scale}" + subhist = file.Get(hist_name) + if subhist == None: + raise RuntimeError( + f"Cannot find histogram {hist_name} in {file.GetName()}" + ) + hists.append( + self.hist_binner.applyBinning( + era, channel, category, model_params, subhist + ) + ) + else: + if unc_name and unc_scale: + hist_name += f"_{unc_name}_{unc_scale}" + hist = file.Get(hist_name) + if hist == None: + raise RuntimeError( + f"Cannot find histogram {hist_name} in {file.GetName()}" + ) + hists.append( + self.hist_binner.applyBinning( + era, channel, category, model_params, hist + ) + ) + if len(hists) == 0: + raise RuntimeError(f"hist list is empty for file {file.GetName()}") + hist = hists[0] + if len(hists) > 1: + for histy in hists[1:]: + hist.Add(histy) + + hist.SetDirectory(0) + if process.scale != 1: + hist.Scale(process.scale) + + if process.is_signal and not (unc_name and unc_scale): + param_str = ( + self.model.paramStr(model_params) if model_params else "*" + ) + key_sig = ( + era, + channel, + category, + ( + param_str + if not self.keep_all_signal_hypothesis_into_single_datacard + else "*" + ), + ) + self.signal_hists_by_key.setdefault(key_sig, []).append(hist) + else: + param_str = ( + self.model.paramStr(model_params) if model_params else "*" + ) + key_sig = ( + era, + channel, + category, + ( + param_str + if not self.keep_all_signal_hypothesis_into_single_datacard + else "*" + ), + ) + signals = self.signal_hists_by_key.get(key_sig, []) + relevant_bins = getRelevantBins( + era, + channel, + category, + signals, + self.signalFractionForRelevantBins, + ) + solution = resolveNegativeBins( + hist, + relevant_bins=relevant_bins, + allow_zero_integral=process.allow_zero_integral, + allow_negative_bins_within_error=process.allow_negative_bins_within_error, + max_n_sigma_for_negative_bins=process.max_n_sigma_for_negative_bins, + allow_negative_integral=process.allow_negative_integral, ) - uncApplies = (unc_value is not None) if isMVLnUnc else unc.appliesTo(process, era, channel, category) - if not uncApplies: - continue - if not process.hasCompatibleModelParams(params, self.model.param_dependent_bkg): - continue - nominal_shape = None - shapes = {} - if unc.needShapes: - nominal_shape = self.getShape(process, era, channel, category, params) - for us in [UncertaintyScale.Up, UncertaintyScale.Down]: - shapes[us] = self.getShape(process, era, channel, category, params, unc_name, us.name) + if not solution.accepted: + axis = hist.GetXaxis() + bins_edges = [ + str(axis.GetBinLowEdge(n)) + for n in range(1, axis.GetNbins() + 2) + ] + bin_values = [ + str(hist.GetBinContent(n)) + for n in range(1, axis.GetNbins() + 1) + ] + bin_errors = [ + str(hist.GetBinError(n)) + for n in range(1, axis.GetNbins() + 1) + ] + print(f'bins_edges: [ {", ".join(bins_edges)} ]') + print(f'bin_values: [ {", ".join(bin_values)} ]') + print(f'bin_errors: [ {", ".join(bin_errors)} ]') + raise RuntimeError( + f"Negative bins found in histogram for {channel}/{category}/{process.hist_name}" + + ( + f" (syst {unc_name}{unc_scale})" + if unc_name and unc_scale + else "" + ) + ) + self.shapes[key] = hist + return self.shapes[key] + + def addProcess(self, proc, era, channel, category): + bin_idx, bin_name = self.getBin(era, channel, category) + process = self.processes[proc] + + def add(model_params, param_str, process_name): + if process.is_data: + self.cb.AddObservations( + [param_str], + [self.analysis], + [era], + [channel], + [(bin_idx, bin_name)], + ) + else: + self.cb.AddProcesses( + [param_str], + [self.analysis], + [era], + [channel], + [process_name], + [(bin_idx, bin_name)], + process.is_signal, + ) + + shape = self.getShape(process, era, channel, category, model_params) + shape_set = False + + def setShape(p): + nonlocal shape_set + print(f"Setting shape for {p}") + if shape_set: + raise RuntimeError("Shape already set") + p.set_shape(shape, True) + shape_set = True + + cb_copy = self.cbCopy(param_str, process_name, era, channel, category) + if process.is_data: + cb_copy.ForEachObs(setShape) + else: + cb_copy.ForEachProc(setShape) - unc_to_apply = unc.resolveType(nominal_shape, shapes, self.autolnNThr, self.asymlnNThr) - if (isMVLnUnc and unc_to_apply.canIgnore(unc_value, self.ignorelnNThr)) or \ - (not isMVLnUnc and unc_to_apply.canIgnore(self.ignorelnNThr)): + if process.is_signal: + model_params = process.params + param_str = self.model.paramStr(model_params) + if self.keep_all_signal_hypothesis_into_single_datacard: + actual_proc_name = f"{process.name}_{param_str}" + add(model_params, "*", actual_proc_name) + self.param_of[("*", actual_proc_name)] = model_params + self.base_of[actual_proc_name] = process.name + else: + actual_proc_name = process.name + add(model_params, param_str, actual_proc_name) + self.param_of[(param_str, actual_proc_name)] = model_params + self.base_of[actual_proc_name] = process.name + + elif self.model.param_dependent_bkg: + for signal_proc in self.processes.values(): + if not signal_proc.is_signal: continue + model_params = signal_proc.params + param_str = ( + self.model.paramStr(model_params) + if not self.keep_all_signal_hypothesis_into_single_datacard + else "*" + ) + add(model_params, param_str, proc) + self.param_of[(param_str, proc)] = model_params + self.base_of[proc] = proc + else: + add(None, "*", proc) + + def addUncertainty(self, unc_name): + unc = self.uncertainties[unc_name] + isMVLnUnc = isinstance(unc, MultiValueLnNUncertainty) + + for proc, param_str, era, channel, category in self.PPECC(): + if proc not in self.channel_processes[channel]: + continue + process = self.processes[proc] + if process.is_data: + continue + model_params = self.param_bins.get(param_str, None) + if not process.hasCompatibleModelParams( + model_params, self.model.param_dependent_bkg + ): + continue + + if self.isMetaEra(era) and unc.type == UncertaintyType.lnN: + self._addMetaEraLnNAsShapeUnc( + unc_name, + proc, + param_str, + process, + era, + channel, + category, + model_params, + ) + continue + + if isMVLnUnc: + unc_value = unc.getUncertaintyForProcess( + process.name, era, channel, category + ) + + uncApplies = ( + unc_value != None + if isMVLnUnc + else unc.appliesTo(process, era, channel, category) + ) + if not uncApplies: + continue + + nominal_shape = None + shapes = {} + if unc.needShapes: + model_params = self.param_bins.get(param_str, None) + nominal_shape = self.getShape( + self.processes[proc], era, channel, category, model_params + ) + + for unc_scale in [UncertaintyScale.Up, UncertaintyScale.Down]: + shapes[unc_scale] = self.getShape( + self.processes[proc], + era, + channel, + category, + model_params, + unc_name, + unc_scale.name, + ) + + unc_to_apply = unc.resolveType( + nominal_shape, shapes, self.autolnNThr, self.asymlnNThr + ) + can_ignore = ( + unc_to_apply.canIgnore(unc_value, self.ignorelnNThr) + if isMVLnUnc + else unc_to_apply.canIgnore(self.ignorelnNThr) + ) + if can_ignore: + print( + f"Ignoring uncertainty {unc_name} for {proc} in {era} {channel} {category}" + ) + continue + systMap = ( + unc_to_apply.valueToMap(unc_value) + if isMVLnUnc + else unc_to_apply.valueToMap() + ) + cb_copy = self.cbCopy(param_str, proc, era, channel, category) + cb_copy.AddSyst(self.cb, unc_name, unc_to_apply.type.name, systMap) + if unc_to_apply.type == UncertaintyType.shape: + shape_set = False + + def setShape(syst): + nonlocal shape_set + print(f"Setting unc shape for {syst}") + if shape_set: + raise RuntimeError("Shape already set") + syst.set_shapes( + shapes[UncertaintyScale.Up], + shapes[UncertaintyScale.Down], + nominal_shape, + ) + shape_set = True + + self.cbCopy(param_str, proc, era, channel, category).syst_name( + [unc_name] + ).ForEachSyst(setShape) + + if self.keep_all_signal_hypothesis_into_single_datacard: + for (param_str, proc_name), params in self.param_of.items(): + for era, channel, category in self.ECC(): + base_name = self.base_of.get(proc_name, proc_name) + process = self.processes[base_name] + if process.is_data: + continue + if not process.hasCompatibleModelParams( + params, self.model.param_dependent_bkg + ): + continue + + if self.isMetaEra(era) and unc.type == UncertaintyType.lnN: + self._addMetaEraLnNAsShapeUnc( + unc_name, + proc_name, + param_str, + process, + era, + channel, + category, + params, + ) + continue + + if isMVLnUnc: + unc_value = unc.getUncertaintyForProcess( + process.name, era, channel, category + ) + uncApplies = ( + (unc_value is not None) + if isMVLnUnc + else unc.appliesTo(process, era, channel, category) + ) + if not uncApplies: + continue + + nominal_shape = None + shapes = {} + if unc.needShapes: + nominal_shape = self.getShape( + process, era, channel, category, params + ) + for us in [UncertaintyScale.Up, UncertaintyScale.Down]: + shapes[us] = self.getShape( + process, + era, + channel, + category, + params, + unc_name, + us.name, + ) + + unc_to_apply = unc.resolveType( + nominal_shape, shapes, self.autolnNThr, self.asymlnNThr + ) + if ( + isMVLnUnc + and unc_to_apply.canIgnore(unc_value, self.ignorelnNThr) + ) or (not isMVLnUnc and unc_to_apply.canIgnore(self.ignorelnNThr)): + continue + + systMap = ( + unc_to_apply.valueToMap(unc_value) + if isMVLnUnc + else unc_to_apply.valueToMap() + ) + cb_copy = self.cbCopy(param_str, proc_name, era, channel, category) + cb_copy.AddSyst(self.cb, unc_name, unc_to_apply.type.name, systMap) + + if unc_to_apply.type == UncertaintyType.shape: + + def setShape(syst): + syst.set_shapes( + shapes[UncertaintyScale.Up], + shapes[UncertaintyScale.Down], + nominal_shape, + ) + + self.cbCopy( + param_str, proc_name, era, channel, category + ).syst_name([unc_name]).ForEachSyst(setShape) + + def writeDatacards(self, output): + os.makedirs(output, exist_ok=True) + + if self.keep_all_signal_hypothesis_into_single_datacard: + dc_file = os.path.join(output, "datacard_combined_signals.txt") + main_shape_file = os.path.join(output, "combined_signals_all.root") + self.cb.cp().mass(["*"]).WriteDatacard(dc_file, main_shape_file) + + for subera, subchannel, subcat in self.ECC(): + tmp_dir = os.path.join(output, subera, subchannel, subcat) + os.makedirs(tmp_dir, exist_ok=True) + _, bin_name = self.getBin(subera, subchannel, subcat) + perbin_dc = os.path.join(tmp_dir, f"datacard_{bin_name}.txt") + perbin_root = os.path.join(tmp_dir, f"{bin_name}.root") + self.cb.cp().bin([bin_name]).mass(["*"]).WriteDatacard( + perbin_dc, perbin_root + ) + + return + + background_names = [n for n, p in self.processes.items() if p.is_background] + for proc_name, process in self.processes.items(): + if not process.is_signal: + continue + processes = [proc_name] + background_names + param_list = [self.model.paramStr(process.params)] + if not self.model.param_dependent_bkg: + param_list.append("*") + dc_file = os.path.join(output, f"datacard_{proc_name}.txt") + shape_file = os.path.join(output, f"{proc_name}.root") + + for subera in self.eras: + for subchannel in self.channels: + tmp_output = os.path.join(output, subera, subchannel) + os.makedirs(tmp_output, exist_ok=True) + tmp_dc_file = os.path.join(tmp_output, f"datacard_{proc_name}.txt") + tmp_shape_file = shape_file + self.cb.cp().era([subera]).channel([subchannel]).mass( + param_list + ).process(processes).WriteDatacard(tmp_dc_file, tmp_shape_file) + + # Also write datacards for meta-eras + for meta_era in self.era_groups.keys(): + for subchannel in self.channels: + tmp_output = os.path.join(output, meta_era, subchannel) + os.makedirs(tmp_output, exist_ok=True) + tmp_dc_file = os.path.join(tmp_output, f"datacard_{proc_name}.txt") + tmp_shape_file = shape_file + self.cb.cp().era([meta_era]).channel([subchannel]).mass( + param_list + ).process(processes).WriteDatacard(tmp_dc_file, tmp_shape_file) + + self.cb.cp().mass(param_list).process(processes).WriteDatacard( + dc_file, shape_file + ) - systMap = unc_to_apply.valueToMap(unc_value) if isMVLnUnc else unc_to_apply.valueToMap() - cb_copy = self.cbCopy(param_str, proc_name, era, channel, category) - cb_copy.AddSyst(self.cb, unc_name, unc_to_apply.type.name, systMap) - - if unc_to_apply.type == UncertaintyType.shape: - def setShape(syst): - syst.set_shapes(shapes[UncertaintyScale.Up], shapes[UncertaintyScale.Down], nominal_shape) - self.cbCopy(param_str, proc_name, era, channel, category).syst_name([unc_name]).ForEachSyst(setShape) - - def writeDatacards(self, output): - os.makedirs(output, exist_ok=True) - - if self.keep_all_signal_hypothesis_into_single_datacard: - dc_file = os.path.join(output, "datacard_combined_signals.txt") - main_shape_file = os.path.join(output, "combined_signals_all.root") - self.cb.cp().mass(['*']).WriteDatacard(dc_file, main_shape_file) - - for subera, subchannel, subcat in self.ECC(): - tmp_dir = os.path.join(output, subera, subchannel, subcat) - os.makedirs(tmp_dir, exist_ok=True) - _, bin_name = self.getBin(subera, subchannel, subcat) - perbin_dc = os.path.join(tmp_dir, f"datacard_{bin_name}.txt") - perbin_root = os.path.join(tmp_dir, f"{bin_name}.root") - self.cb.cp().bin([bin_name]).mass(['*']).WriteDatacard(perbin_dc, perbin_root) - - return - - background_names = [n for n,p in self.processes.items() if p.is_background] - for proc_name, process in self.processes.items(): - if not process.is_signal: continue - processes = [proc_name] + background_names - param_list = [self.model.paramStr(process.params)] - if not self.model.param_dependent_bkg: - param_list.append('*') - dc_file = os.path.join(output, f"datacard_{proc_name}.txt") - shape_file = os.path.join(output, f"{proc_name}.root") - - for subera in self.eras: - for subchannel in self.channels: - tmp_output = os.path.join(output, subera, subchannel) - os.makedirs(tmp_output, exist_ok=True) - tmp_dc_file = os.path.join(tmp_output, f"datacard_{proc_name}.txt") - tmp_shape_file = shape_file - self.cb.cp().era([subera]).channel([subchannel]).mass(param_list).process(processes).WriteDatacard(tmp_dc_file, tmp_shape_file) - - self.cb.cp().mass(param_list).process(processes).WriteDatacard(dc_file, shape_file) - - - - def createDatacards(self, output, verbose=1): - try: - for era, channel, category in self.ECC(): - for name, p in self.processes.items(): - if name not in self.channel_processes[channel]: - continue - if p.is_signal: - self.addProcess(name, era, channel, category) - for era, channel, category in self.ECC(): - for name, p in self.processes.items(): - if name not in self.channel_processes[channel]: - continue - if not p.is_signal: - self.addProcess(name, era, channel, category) - - for unc_name in self.uncertainties.keys(): - print(f"adding uncertainty: {unc_name}") - self.addUncertainty(unc_name) - if self.autoMCStats["apply"]: - self.cb.SetAutoMCStats(self.cb, self.autoMCStats["threshold"], self.autoMCStats["apply_to_signal"], - self.autoMCStats["mode"]) - if verbose > 0: - self.cb.PrintAll() - self.writeDatacards(output) - finally: - for file in self.input_files.values(): - file.Close() + def createDatacards(self, output, verbose=1): + try: + for era, channel, category in self.ECC(): + for name, p in self.processes.items(): + if name not in self.channel_processes[channel]: + continue + if p.is_signal: + self.addProcess(name, era, channel, category) + for era, channel, category in self.ECC(): + for name, p in self.processes.items(): + if name not in self.channel_processes[channel]: + continue + if not p.is_signal: + self.addProcess(name, era, channel, category) + + for unc_name in self.uncertainties.keys(): + print(f"adding uncertainty: {unc_name}") + self.addUncertainty(unc_name) + if self.autoMCStats["apply"]: + self.cb.SetAutoMCStats( + self.cb, + self.autoMCStats["threshold"], + self.autoMCStats["apply_to_signal"], + self.autoMCStats["mode"], + ) + if verbose > 0: + self.cb.PrintAll() + self.writeDatacards(output) + finally: + for file in self.input_files.values(): + file.Close() diff --git a/dc_make/uncertainty.py b/dc_make/uncertainty.py index 1b4b5ad..4981f7c 100644 --- a/dc_make/uncertainty.py +++ b/dc_make/uncertainty.py @@ -4,306 +4,386 @@ from enum import Enum from ..common.tools import importROOT + ROOT = importROOT() from CombineHarvester.CombineTools.ch import SystMap + class UncertaintyScale(Enum): - Up = 1 - Down = -1 + Up = 1 + Down = -1 + class UncertaintyType(Enum): - auto = 0 - lnN = 1 - shape = 2 + auto = 0 + lnN = 1 + shape = 2 + class Uncertainty: - def __init__(self, name, processes=None, eras=None, channels=None, categories=None, hasMultipleValues=False): - self.name = name - self.processes = processes - self.eras = eras - self.channels = channels - self.categories = categories - self.hasMultipleValues=hasMultipleValues - - def appliesTo(self, process, era, channel, category): - match_subprocess = any(map(lambda p: Uncertainty.hasMatch(p, self.processes), process.subprocesses)) if process.subprocesses else False - return (Uncertainty.hasMatch(process.name, self.processes) or match_subprocess) \ - and Uncertainty.hasMatch(era, self.eras) \ - and Uncertainty.hasMatch(channel, self.channels) \ - and Uncertainty.hasMatch(category, self.categories) - - def resolveType(self, nominal_shape, shape_variations, auto_lnN_threshold, asym_value_threshold): - return self - - @staticmethod - def fitFlat(histogram): - def constantFunction(x,par): - return par[0] - fit_func = ROOT.TF1("fit_func", constantFunction, 0, 10, 1) - fit_func.SetParameter(0, 1.0) - histogram.Fit(fit_func, "nq") - chi2 = fit_func.GetChisquare() - ndf = fit_func.GetNDF() - p_value = ROOT.TMath.Prob(chi2, ndf) - fit_param = fit_func.GetParameter(0) - fit_param_error = fit_func.GetParError(0) - return chi2, p_value, fit_param, fit_param_error - - @staticmethod - def haveCompatibleShapes(hist_a, hist_b, p_thr): - hist_b = hist_b.Clone() - hist_b.Divide(hist_a) - _, p_value, _, _ = Uncertainty.fitFlat(hist_b) - return p_value > p_thr - - @staticmethod - def hasMatch(value, patterns): - if len(patterns) == 0: - return True - for pattern in patterns: - if pattern[0] == '^': - if re.match(pattern, value): - return True - elif value == pattern: - return True - return False - - @staticmethod - def fromConfig(entry): - name = entry["name"] - # print(name) - unc_type = UncertaintyType[entry["type"]] - hasMultipleValues = entry.get("hasMultipleValues", False) - def getPatternList(key, entry=entry): - if key not in entry: - return [] - v = entry[key] - if type(v) == str: - return [v] - return v - - args = {} - for key in ["processes", "eras", "channels", "categories"]: - args[key] = getPatternList(key) - - if unc_type == UncertaintyType.lnN: - value = entry.get("value") - # print(value) - if value is None: - raise RuntimeError("Uncertainty must have a value") - if isinstance(value, (float, int)): - value = float(value) - unc = LnNUncertainty(name, value, **args) - unc.checkValue() - elif isinstance(value, list) and hasMultipleValues: - multi_values = {} - - for sub_entry in value: - key_value = (tuple(getPatternList("processes", sub_entry)), - tuple(getPatternList("eras", sub_entry)), - tuple(getPatternList("channels", sub_entry)), - tuple(getPatternList("categories", sub_entry))) - for key in ["processes", "eras", "channels", "categories"]: - args[key] = getPatternList(key,sub_entry) - multi_values[key_value] = sub_entry["value"] - if isinstance(sub_entry["value"], list) and len(sub_entry["value"]) == 2: - multi_values[key_value] = {UncertaintyScale.Down: sub_entry["value"][0], UncertaintyScale.Up: sub_entry["value"][1]} - unc = MultiValueLnNUncertainty(name, multi_values,**args) - unc.checkValue() - - elif isinstance(value, list) and len(value) == 2 and not hasMultipleValues: - value = {UncertaintyScale.Down: value[0], UncertaintyScale.Up: value[1]} - unc = LnNUncertainty(name, value, **args) - unc.checkValue() - elif isinstance(value, dict): - unc = LnNUncertainty(name, value, **args) - unc.checkValue() - else: - raise RuntimeError("Invalid lnN uncertainty value") - elif unc_type == UncertaintyType.shape: - unc = ShapeUncertainty(name, **args) - elif unc_type == UncertaintyType.auto: - unc = AutoUncertainty(name, **args) - else: - raise RuntimeError("Invalid uncertainty type") - - return unc + def __init__( + self, + name, + processes=None, + eras=None, + channels=None, + categories=None, + hasMultipleValues=False, + ): + self.name = name + self.processes = processes + self.eras = eras + self.channels = channels + self.categories = categories + self.hasMultipleValues = hasMultipleValues + + def appliesTo(self, process, era, channel, category): + match_subprocess = ( + any( + map( + lambda p: Uncertainty.hasMatch(p, self.processes), + process.subprocesses, + ) + ) + if process.subprocesses + else False + ) + return ( + (Uncertainty.hasMatch(process.name, self.processes) or match_subprocess) + and Uncertainty.hasMatch(era, self.eras) + and Uncertainty.hasMatch(channel, self.channels) + and Uncertainty.hasMatch(category, self.categories) + ) + + def resolveType( + self, nominal_shape, shape_variations, auto_lnN_threshold, asym_value_threshold + ): + return self + + @staticmethod + def fitFlat(histogram): + def constantFunction(x, par): + return par[0] + + fit_func = ROOT.TF1("fit_func", constantFunction, 0, 10, 1) + fit_func.SetParameter(0, 1.0) + histogram.Fit(fit_func, "nq") + chi2 = fit_func.GetChisquare() + ndf = fit_func.GetNDF() + p_value = ROOT.TMath.Prob(chi2, ndf) + fit_param = fit_func.GetParameter(0) + fit_param_error = fit_func.GetParError(0) + return chi2, p_value, fit_param, fit_param_error + + @staticmethod + def haveCompatibleShapes(hist_a, hist_b, p_thr): + hist_b = hist_b.Clone() + hist_b.Divide(hist_a) + _, p_value, _, _ = Uncertainty.fitFlat(hist_b) + return p_value > p_thr + + @staticmethod + def hasMatch(value, patterns): + if len(patterns) == 0: + return True + for pattern in patterns: + if pattern[0] == "^": + if re.match(pattern, value): + return True + elif value == pattern: + return True + return False + + @staticmethod + def fromConfig(entry): + name = entry["name"] + # print(name) + unc_type = UncertaintyType[entry["type"]] + hasMultipleValues = entry.get("hasMultipleValues", False) + + def getPatternList(key, entry=entry): + if key not in entry: + return [] + v = entry[key] + if type(v) == str: + return [v] + return v + + args = {} + for key in ["processes", "eras", "channels", "categories"]: + args[key] = getPatternList(key) + + if unc_type == UncertaintyType.lnN: + value = entry.get("value") + # print(value) + if value is None: + raise RuntimeError("Uncertainty must have a value") + if isinstance(value, (float, int)): + value = float(value) + unc = LnNUncertainty(name, value, **args) + unc.checkValue() + elif isinstance(value, list) and hasMultipleValues: + multi_values = {} + + for sub_entry in value: + key_value = ( + tuple(getPatternList("processes", sub_entry)), + tuple(getPatternList("eras", sub_entry)), + tuple(getPatternList("channels", sub_entry)), + tuple(getPatternList("categories", sub_entry)), + ) + for key in ["processes", "eras", "channels", "categories"]: + args[key] = getPatternList(key, sub_entry) + multi_values[key_value] = sub_entry["value"] + if ( + isinstance(sub_entry["value"], list) + and len(sub_entry["value"]) == 2 + ): + multi_values[key_value] = { + UncertaintyScale.Down: sub_entry["value"][0], + UncertaintyScale.Up: sub_entry["value"][1], + } + unc = MultiValueLnNUncertainty(name, multi_values, **args) + unc.checkValue() + + elif isinstance(value, list) and len(value) == 2 and not hasMultipleValues: + value = {UncertaintyScale.Down: value[0], UncertaintyScale.Up: value[1]} + unc = LnNUncertainty(name, value, **args) + unc.checkValue() + elif isinstance(value, dict): + unc = LnNUncertainty(name, value, **args) + unc.checkValue() + else: + raise RuntimeError("Invalid lnN uncertainty value") + elif unc_type == UncertaintyType.shape: + unc = ShapeUncertainty(name, **args) + elif unc_type == UncertaintyType.auto: + unc = AutoUncertainty(name, **args) + else: + raise RuntimeError("Invalid uncertainty type") + return unc -class LnNUncertainty(Uncertainty): - def __init__(self, name, value, **kwargs): - super().__init__(name, **kwargs) - self.value = value - - @property - def type(self): - return UncertaintyType.lnN - - @property - def needShapes(self): - return False - - def canIgnore(self, value_thr): - if type(self.value) == float: - return abs(self.value) < value_thr - return abs(self.value[UncertaintyScale.Up]) < value_thr and abs(self.value[UncertaintyScale.Down]) < value_thr - - def checkValue(self, raise_error=True): - pass_zero_check = True - pass_sign_check = True - pass_magnitude_check = True - if type(self.value) == float: - - if self.value == 0: - pass_zero_check = False - if abs(self.value) >= 1: - pass_magnitude_check = False - elif type(self.value) == dict: - if self.value[UncertaintyScale.Down] == 0 or self.value[UncertaintyScale.Up] == 0: - pass_zero_check = False - if self.value[UncertaintyScale.Down] * self.value[UncertaintyScale.Up] > 0: - pass_sign_check = False - if abs(self.value[UncertaintyScale.Down]) >= 1 or abs(self.value[UncertaintyScale.Up]) >= 1: - pass_magnitude_check = False - else: - raise RuntimeError("Invalid lnN uncertainty value") - - if not pass_zero_check and raise_error: - raise RuntimeError(f"Value of lnN uncertainty {self.name} cannot be zero") - if not pass_sign_check and raise_error: - raise RuntimeError(f"Up/down values of lnN uncertainty {self.name} must have opposite signs") - if not pass_magnitude_check and raise_error: - raise RuntimeError(f"Value of lnN uncertainty {self.name} must be less than 100%") - return pass_zero_check, pass_sign_check, pass_magnitude_check - - def valueToMap(self, digits=3): - if type(self.value) == float: - value = round(1 + self.value, digits) - else: - v_down = round(1 + self.value[UncertaintyScale.Down], digits) - v_up = round(1 + self.value[UncertaintyScale.Up], digits) - value = (v_down, v_up) - return SystMap()(value) -class MultiValueLnNUncertainty(Uncertainty): - def __init__(self, name, values, **kwargs): - super().__init__(name, **kwargs) - # self.name = name - self.values = values - - @property - def type(self): - return UncertaintyType.lnN - - @property - def needShapes(self): - return False - - @classmethod - def fromConfig(cls, cfg): - name = cfg['name'] - values = cfg['value'] - return cls(name, values) - - def canIgnore(self, unc_value, value_thr): - if type(unc_value) == float: - return abs(unc_value) < value_thr - return abs(unc_value[UncertaintyScale.Up]) < value_thr and abs(unc_value[UncertaintyScale.Down]) < value_thr - - - def checkValue(self): - for key in self.values.keys(): - processes, eras, channels, categories = key - value = self.values[key] +class LnNUncertainty(Uncertainty): + def __init__(self, name, value, **kwargs): + super().__init__(name, **kwargs) + self.value = value + + @property + def type(self): + return UncertaintyType.lnN + + @property + def needShapes(self): + return False + + def canIgnore(self, value_thr): + if type(self.value) == float: + return abs(self.value) < value_thr + return ( + abs(self.value[UncertaintyScale.Up]) < value_thr + and abs(self.value[UncertaintyScale.Down]) < value_thr + ) + + def checkValue(self, raise_error=True): pass_zero_check = True pass_sign_check = True pass_magnitude_check = True - if type(value) == float: - - if value == 0: - pass_zero_check = False - if abs(value) >= 1: - pass_magnitude_check = False - elif type(value) == dict: - if value[UncertaintyScale.Down] == 0 or value[UncertaintyScale.Up] == 0: - pass_zero_check = False - if value[UncertaintyScale.Down] * value[UncertaintyScale.Up] > 0: - pass_sign_check = False - if abs(value[UncertaintyScale.Down]) >= 1 or abs(value[UncertaintyScale.Up]) >= 1: - pass_magnitude_check = False + if type(self.value) == float: + + if self.value == 0: + pass_zero_check = False + if abs(self.value) >= 1: + pass_magnitude_check = False + elif type(self.value) == dict: + if ( + self.value[UncertaintyScale.Down] == 0 + or self.value[UncertaintyScale.Up] == 0 + ): + pass_zero_check = False + if self.value[UncertaintyScale.Down] * self.value[UncertaintyScale.Up] > 0: + pass_sign_check = False + if ( + abs(self.value[UncertaintyScale.Down]) >= 1 + or abs(self.value[UncertaintyScale.Up]) >= 1 + ): + pass_magnitude_check = False else: - raise RuntimeError("Invalid lnN uncertainty value") - # if not isinstance(value, (int, float)): - # raise ValueError(f"Invalid uncertainty value: {value}. Must be numeric.") - # if not isinstance(processes, tuple) or not all(isinstance(p, str) for p in processes): - # raise ValueError(f"Invalid processes list: {processes}. Must be a list of strings.") - - def getUncertaintyForProcess(self, process): - for key in self.values.keys(): - processes, eras, channels, categories = key - if process in processes: - return self.values[key] - return None - - def valueToMap(self, unc_value, digits=3): - if type(unc_value) == float: - value = round(1 + unc_value, digits) - else: - v_down = round(1 + unc_value[UncertaintyScale.Down], digits) - v_up = round(1 + unc_value[UncertaintyScale.Up], digits) - value = (v_down, v_up) - return SystMap()(value) + raise RuntimeError("Invalid lnN uncertainty value") + + if not pass_zero_check and raise_error: + raise RuntimeError(f"Value of lnN uncertainty {self.name} cannot be zero") + if not pass_sign_check and raise_error: + raise RuntimeError( + f"Up/down values of lnN uncertainty {self.name} must have opposite signs" + ) + if not pass_magnitude_check and raise_error: + raise RuntimeError( + f"Value of lnN uncertainty {self.name} must be less than 100%" + ) + return pass_zero_check, pass_sign_check, pass_magnitude_check + + def valueToMap(self, digits=3): + if type(self.value) == float: + value = round(1 + self.value, digits) + else: + v_down = round(1 + self.value[UncertaintyScale.Down], digits) + v_up = round(1 + self.value[UncertaintyScale.Up], digits) + value = (v_down, v_up) + return SystMap()(value) + + +class MultiValueLnNUncertainty(Uncertainty): + def __init__(self, name, values, **kwargs): + super().__init__(name, **kwargs) + # self.name = name + self.values = values + + @property + def type(self): + return UncertaintyType.lnN + + @property + def needShapes(self): + return False + + @classmethod + def fromConfig(cls, cfg): + name = cfg["name"] + values = cfg["value"] + return cls(name, values) + + def canIgnore(self, unc_value, value_thr): + if type(unc_value) == float: + return abs(unc_value) < value_thr + return ( + abs(unc_value[UncertaintyScale.Up]) < value_thr + and abs(unc_value[UncertaintyScale.Down]) < value_thr + ) + + def checkValue(self): + for key in self.values.keys(): + processes, eras, channels, categories = key + value = self.values[key] + pass_zero_check = True + pass_sign_check = True + pass_magnitude_check = True + if type(value) == float: + + if value == 0: + pass_zero_check = False + if abs(value) >= 1: + pass_magnitude_check = False + elif type(value) == dict: + if value[UncertaintyScale.Down] == 0 or value[UncertaintyScale.Up] == 0: + pass_zero_check = False + if value[UncertaintyScale.Down] * value[UncertaintyScale.Up] > 0: + pass_sign_check = False + if ( + abs(value[UncertaintyScale.Down]) >= 1 + or abs(value[UncertaintyScale.Up]) >= 1 + ): + pass_magnitude_check = False + else: + raise RuntimeError("Invalid lnN uncertainty value") + # if not isinstance(value, (int, float)): + # raise ValueError(f"Invalid uncertainty value: {value}. Must be numeric.") + # if not isinstance(processes, tuple) or not all(isinstance(p, str) for p in processes): + # raise ValueError(f"Invalid processes list: {processes}. Must be a list of strings.") + + def getUncertaintyForProcess(self, process, era=None, channel=None, category=None): + for key in self.values.keys(): + processes, eras, channels, categories = key + if processes and process not in processes: + continue + if era is not None and eras and era not in eras: + continue + if channel is not None and channels and channel not in channels: + continue + if category is not None and categories and category not in categories: + continue + return self.values[key] + return None + + def valueToMap(self, unc_value, digits=3): + if type(unc_value) == float: + value = round(1 + unc_value, digits) + else: + v_down = round(1 + unc_value[UncertaintyScale.Down], digits) + v_up = round(1 + unc_value[UncertaintyScale.Up], digits) + value = (v_down, v_up) + return SystMap()(value) class ShapeUncertainty(Uncertainty): - @property - def type(self): - return UncertaintyType.shape + @property + def type(self): + return UncertaintyType.shape - @property - def needShapes(self): - return True + @property + def needShapes(self): + return True + + def canIgnore(self, value_thr): + return False - def canIgnore(self, value_thr): - return False + def valueToMap(self, digits=3): + return SystMap()(1.0) - def valueToMap(self, digits=3): - return SystMap()(1.0) class AutoUncertainty(Uncertainty): - @property - def needShapes(self): - return True - - def canIgnore(self, value_thr): - raise RuntimeError("Uncertainty type is not resolved yet") - - def valueToMap(self, digits=3): - raise RuntimeError("Uncertainty type is not resolved yet") - - def resolveType(self, nominal_shape, shape_variations, auto_lnN_threshold, asym_value_threshold): - nominal_shape = nominal_shape.Clone() - for n in range(0, nominal_shape.GetNbinsX() + 2): - nominal_shape.SetBinError(n, 0) - all_compatible = True - for unc_scale in [ UncertaintyScale.Up, UncertaintyScale.Down ]: - if not Uncertainty.haveCompatibleShapes(nominal_shape, shape_variations[unc_scale], auto_lnN_threshold): - all_compatible = False - break - args = { "processes": self.processes, "eras": self.eras, "channels": self.channels, "categories": self.categories } - if all_compatible: - unc_value = {} - nominal_integral = nominal_shape.Integral() - for unc_scale in [ UncertaintyScale.Up, UncertaintyScale.Down ]: - scaled_integral = shape_variations[unc_scale].Integral() - unc_value[unc_scale] = (scaled_integral - nominal_integral) / nominal_integral - unc = LnNUncertainty(self.name, unc_value, **args) - pass_zero_check, pass_sign_check, pass_magnitude_check = unc.checkValue(raise_error=False) - if pass_sign_check and pass_magnitude_check: - delta = abs(unc.value[UncertaintyScale.Up]) - abs(unc.value[UncertaintyScale.Down]) - if abs(delta) <= asym_value_threshold: - max_value = max(abs(unc.value[UncertaintyScale.Up]), abs(unc.value[UncertaintyScale.Down])) - unc.value = math.copysign(max_value, unc.value[UncertaintyScale.Up]) - return unc - return ShapeUncertainty(self.name, **args) + @property + def needShapes(self): + return True + + def canIgnore(self, value_thr): + raise RuntimeError("Uncertainty type is not resolved yet") + + def valueToMap(self, digits=3): + raise RuntimeError("Uncertainty type is not resolved yet") + + def resolveType( + self, nominal_shape, shape_variations, auto_lnN_threshold, asym_value_threshold + ): + nominal_shape = nominal_shape.Clone() + for n in range(0, nominal_shape.GetNbinsX() + 2): + nominal_shape.SetBinError(n, 0) + all_compatible = True + for unc_scale in [UncertaintyScale.Up, UncertaintyScale.Down]: + if not Uncertainty.haveCompatibleShapes( + nominal_shape, shape_variations[unc_scale], auto_lnN_threshold + ): + all_compatible = False + break + args = { + "processes": self.processes, + "eras": self.eras, + "channels": self.channels, + "categories": self.categories, + } + if all_compatible: + unc_value = {} + nominal_integral = nominal_shape.Integral() + for unc_scale in [UncertaintyScale.Up, UncertaintyScale.Down]: + scaled_integral = shape_variations[unc_scale].Integral() + unc_value[unc_scale] = ( + scaled_integral - nominal_integral + ) / nominal_integral + unc = LnNUncertainty(self.name, unc_value, **args) + pass_zero_check, pass_sign_check, pass_magnitude_check = unc.checkValue( + raise_error=False + ) + if pass_sign_check and pass_magnitude_check: + delta = abs(unc.value[UncertaintyScale.Up]) - abs( + unc.value[UncertaintyScale.Down] + ) + if abs(delta) <= asym_value_threshold: + max_value = max( + abs(unc.value[UncertaintyScale.Up]), + abs(unc.value[UncertaintyScale.Down]), + ) + unc.value = math.copysign(max_value, unc.value[UncertaintyScale.Up]) + return unc + return ShapeUncertainty(self.name, **args) From b46fcb3d908c3ff3ba2394ed0ebd51fa13f42e76 Mon Sep 17 00:00:00 2001 From: Devin Date: Wed, 17 Jun 2026 22:55:36 +0200 Subject: [PATCH 2/2] Restructure meta eras to now be handled through the normal eras list --- dc_make/maker.py | 133 ++++++++++++++++------------------------------- 1 file changed, 45 insertions(+), 88 deletions(-) diff --git a/dc_make/maker.py b/dc_make/maker.py index ae1fa61..50e3f7b 100644 --- a/dc_make/maker.py +++ b/dc_make/maker.py @@ -18,6 +18,7 @@ UncertaintyType, UncertaintyScale, MultiValueLnNUncertainty, + LnNUncertainty, ShapeUncertainty, ) from .model import Model @@ -57,22 +58,7 @@ def __init__( self.categories = cfg["categories"] self.signalFractionForRelevantBins = cfg["signalFractionForRelevantBins"] - # Load era groups if defined (e.g., Early_Run3 combining multiple eras) self.era_groups = cfg.get("era_groups", {}) - # Create reverse mapping: sub_era -> meta_era - self.era_to_group = {} - for group_name, sub_eras in self.era_groups.items(): - for sub_era in sub_eras: - if sub_era not in self.era_to_group: - self.era_to_group[sub_era] = [] - self.era_to_group[sub_era].append(group_name) - - # Expand eras to include both regular eras and meta-eras - all_eras = list(self.eras) - for group_name in self.era_groups.keys(): - if group_name not in all_eras: - all_eras.append(group_name) - self.all_eras = all_eras self.bins = [] for era, channel, cat in self.ECC(): @@ -162,7 +148,7 @@ def getBin(self, era, channel, category, return_name=True, return_index=True): def getSubEras(self, era): """Get sub-eras for a given era. If era is a meta-era, return its sub-eras. Otherwise return [era].""" - if era in self.era_groups: + if self.isMetaEra(era): return self.era_groups[era] return [era] @@ -175,18 +161,14 @@ def cbCopy(self, param_str, process, era, channel, category): return self.cb.cp().mass([param_str]).process([process]).bin([bin_name]) def ECC(self): - return itertools.product(self.all_eras, self.channels, self.categories) + return itertools.product(self.eras, self.channels, self.categories) def PPECC(self): param_bins = list(self.param_bins.keys()) if not self.model.param_dependent_bkg: param_bins.append("*") return itertools.product( - self.processes.keys(), - param_bins, - self.all_eras, - self.channels, - self.categories, + self.processes.keys(), param_bins, self.eras, self.channels, self.categories ) def getInputFile(self, era, model_params): @@ -233,62 +215,49 @@ def _loadBinnedHist(self, file, era, channel, category, model_params, hist_name) def _getSubEraLnNVariedShapes( self, unc, process, sub_era, channel, category, model_params ): - if process.subprocesses and isinstance(unc, MultiValueLnNUncertainty): - file_name, file = self.getInputFile(sub_era, model_params) - up_hist = None - down_hist = None - applies = False - - for subp in process.subprocesses: - hist = self._loadBinnedHist( - file, - sub_era, - channel, - category, - model_params, - f"{channel}/{category}/{subp}", - ) - unc_value = self._getLnNValue( - unc, process, subp, sub_era, channel, category - ) - if unc_value is not None: - applies = True - sub_up = self._applyLnNToHist(hist, unc_value, UncertaintyScale.Up) - sub_down = self._applyLnNToHist( - hist, unc_value, UncertaintyScale.Down - ) - else: - sub_up = hist.Clone() - sub_down = hist.Clone() - sub_up.SetDirectory(0) - sub_down.SetDirectory(0) - - if up_hist is None: - up_hist = sub_up - down_hist = sub_down - else: - up_hist.Add(sub_up) - down_hist.Add(sub_down) - - if process.scale != 1: - up_hist.Scale(process.scale) - down_hist.Scale(process.scale) - return up_hist, down_hist, applies - - nominal_sub = self.getShape(process, sub_era, channel, category, model_params) - unc_value = self._getLnNValue( - unc, process, process.name, sub_era, channel, category + file_name, file = self.getInputFile(sub_era, model_params) + hist_names = ( + [(subp, subp) for subp in process.subprocesses] + if process.subprocesses + else [(process.hist_name, process.name)] ) - if unc_value is None: - up_hist = nominal_sub.Clone() - down_hist = nominal_sub.Clone() - up_hist.SetDirectory(0) - down_hist.SetDirectory(0) - return up_hist, down_hist, False + up_hist = None + down_hist = None + applies = False + + for hist_name_suffix, proc_name_for_unc in hist_names: + hist = self._loadBinnedHist( + file, + sub_era, + channel, + category, + model_params, + f"{channel}/{category}/{hist_name_suffix}", + ) + unc_value = self._getLnNValue( + unc, process, proc_name_for_unc, sub_era, channel, category + ) + if unc_value is not None: + applies = True + sub_up = self._applyLnNToHist(hist, unc_value, UncertaintyScale.Up) + sub_down = self._applyLnNToHist(hist, unc_value, UncertaintyScale.Down) + else: + sub_up = hist.Clone() + sub_down = hist.Clone() + sub_up.SetDirectory(0) + sub_down.SetDirectory(0) + + if up_hist is None: + up_hist = sub_up + down_hist = sub_down + else: + up_hist.Add(sub_up) + down_hist.Add(sub_down) - up_hist = self._applyLnNToHist(nominal_sub, unc_value, UncertaintyScale.Up) - down_hist = self._applyLnNToHist(nominal_sub, unc_value, UncertaintyScale.Down) - return up_hist, down_hist, True + if process.scale != 1: + up_hist.Scale(process.scale) + down_hist.Scale(process.scale) + return up_hist, down_hist, applies def getMetaEraLnNShapeUnc(self, unc, process, era, channel, category, model_params): if not self.isMetaEra(era): @@ -587,7 +556,6 @@ def add(model_params, param_str, process_name): def setShape(p): nonlocal shape_set - print(f"Setting shape for {p}") if shape_set: raise RuntimeError("Shape already set") p.set_shape(shape, True) @@ -854,17 +822,6 @@ def writeDatacards(self, output): param_list ).process(processes).WriteDatacard(tmp_dc_file, tmp_shape_file) - # Also write datacards for meta-eras - for meta_era in self.era_groups.keys(): - for subchannel in self.channels: - tmp_output = os.path.join(output, meta_era, subchannel) - os.makedirs(tmp_output, exist_ok=True) - tmp_dc_file = os.path.join(tmp_output, f"datacard_{proc_name}.txt") - tmp_shape_file = shape_file - self.cb.cp().era([meta_era]).channel([subchannel]).mass( - param_list - ).process(processes).WriteDatacard(tmp_dc_file, tmp_shape_file) - self.cb.cp().mass(param_list).process(processes).WriteDatacard( dc_file, shape_file )