#ifndef BDT_CLASSIFICATION #define BDT_CLASSIFICATION #include #include #include #include #include #include #include "RtypesCore.h" enum TVT { Double, Float, Int }; class TV { private: std::string data_name; std::string mc_name; std::string train_name; TVT type; Double_t mc_double_value; Float_t mc_float_value; Double_t data_double_value; Float_t data_float_value; TV(std::string data_name, std::string mc_name, std::string train_name, TVT type) : data_name{data_name}, mc_name{mc_name}, train_name{train_name}, type{type} { } public: static TV *Float(std::string data_name, std::string mc_name, std::string train_name) { return new TV(data_name, mc_name, train_name, TVT::Float); } static TV *Float(std::string data_name, std::string mc_name) { return new TV(data_name, mc_name, data_name, TVT::Float); } static TV *Float(std::string data_name) { return new TV(data_name, data_name, data_name, TVT::Float); } static TV *Double(std::string data_name, std::string mc_name, std::string train_name) { return new TV(data_name, mc_name, train_name, TVT::Double); } static TV *Double(std::string data_name, std::string mc_name) { return new TV(data_name, mc_name, data_name, TVT::Double); } static TV *Double(std::string data_name) { return new TV(data_name, data_name, data_name, TVT::Double); } const char *GetDataName() { return data_name.c_str(); } const char *GetMCName() { return mc_name.c_str(); } const char *GetTrainName() { return train_name.c_str(); } Double_t *GetMCDoubleRef() { return &mc_double_value; } Float_t *GetMCFloatRef() { return &mc_float_value; } Double_t *GetDataDoubleRef() { return &data_double_value; } Float_t *GetDataFloatRef() { return &data_float_value; } Double_t GetDataDouble() { return data_double_value; } Float_t GetDataFloat() { return data_float_value; } void PrintDataValue(int entry) { std::cout << data_name << " (" << entry << "): "; if (IsDouble()) { std::cout << data_double_value; } else if (IsFloat()) { std::cout << data_float_value; } std::cout << std::endl; } void PrintMCValue(int entry) { std::cout << mc_name << " (" << entry << "): "; if (IsDouble()) { std::cout << mc_double_value; } else if (IsFloat()) { std::cout << mc_float_value; } std::cout << std::endl; } bool IsDataFinite() { if (IsDouble()) { return std::isfinite(data_double_value); } else if (IsFloat()) { return std::isfinite(data_float_value); } return false; } bool IsMCFinite() { if (IsDouble()) { return std::isfinite(mc_double_value); } else if (IsFloat()) { return std::isfinite(mc_float_value); } return false; } bool IsDouble() { return type == TVT::Double; } bool IsFloat() { return type == TVT::Float; } }; void ConnectVarsToData(std::vector vars, TChain *data_chain, TChain *mc_chain, TTree *sig_tree, TTree *bkg_tree) { for (size_t i = 0; i < vars.size(); i++) { if (vars[i]->IsDouble()) { data_chain->SetBranchAddress(vars[i]->GetDataName(), vars[i]->GetDataDoubleRef()); mc_chain->SetBranchAddress(vars[i]->GetMCName(), vars[i]->GetMCDoubleRef()); sig_tree->Branch(vars[i]->GetTrainName(), vars[i]->GetMCDoubleRef(), TString::Format("%s/D", vars[i]->GetTrainName())); bkg_tree->Branch(vars[i]->GetTrainName(), vars[i]->GetDataDoubleRef(), TString::Format("%s/D", vars[i]->GetTrainName())); } else if (vars[i]->IsFloat()) { data_chain->SetBranchAddress(vars[i]->GetDataName(), vars[i]->GetDataFloatRef()); mc_chain->SetBranchAddress(vars[i]->GetMCName(), vars[i]->GetMCFloatRef()); sig_tree->Branch(vars[i]->GetTrainName(), vars[i]->GetMCFloatRef(), TString::Format("%s/F", vars[i]->GetTrainName())); bkg_tree->Branch(vars[i]->GetTrainName(), vars[i]->GetDataFloatRef(), TString::Format("%s/F", vars[i]->GetTrainName())); } } } void TrainBDT(std::vector vars, const char* unique_id, TTree *sig_tree, TTree *bkg_tree) { TString outfile_name = TString::Format("%s_tmva_out.root", unique_id); TFile *output_file = TFile::Open(outfile_name, "RECREATE"); TString factory_options("V:Silent:Color:DrawProgressBar:Transformations=I;D;P;G,D:AnalysisType=Auto"); TMVA::Factory *factory = new TMVA::Factory(TString::Format("%s_factory", unique_id), output_file, factory_options); TMVA::DataLoader *data_loader = new TMVA::DataLoader(TString::Format("%s_dataloader", unique_id)); for (int i = 0; i < vars.size(); i++) { std::cout << "@TMVA: Adding Branch: " << vars[i]->GetTrainName() << std::endl; if (vars[i]->IsDouble()) { data_loader->AddVariable(vars[i]->GetTrainName(), 'D'); } else if (vars[i]->IsFloat()) { data_loader->AddVariable(vars[i]->GetTrainName(), 'F'); } } Double_t signal_weight = 1.0, background_weight = 1.0; data_loader->AddSignalTree(sig_tree, signal_weight); data_loader->AddBackgroundTree(bkg_tree, background_weight); data_loader->PrepareTrainingAndTestTree("", "", "nTrain_Signal=0:nTrain_Background=0:SplitMode=Random:NormMode=EqualNumEvents:!V"); factory->BookMethod(data_loader, TMVA::Types::kBDT, "BDT", "!H:!V:NTrees=600:MinNodeSize=2.5%:CreateMVAPdfs:MaxDepth=3:BoostType=AdaBoost:AdaBoostBeta=0.5:UseBaggedBoost:BaggedSampleFraction=0.5:SeparationType=GiniIndex:nCuts=20"); factory->TrainAllMethods(); factory->TestAllMethods(); factory->EvaluateAllMethods(); output_file->Close(); } TMVA::Reader* SetupReader(std::vector vars, Float_t* train_vars, const char* unique_id) { TMVA::Reader *reader = new TMVA::Reader("!Color:!Silent"); for (size_t i = 0; i < vars.size(); i++) { reader->AddVariable(vars[i]->GetTrainName(), &train_vars[i]); } reader->BookMVA("BDT", TString::Format("./%s_dataloader/weights/%s_factory_BDT.weights.xml", unique_id, unique_id)); return reader; } void DrawBDTProbs(TH1D *histogram, const double cut_value, const char *folder) { std::filesystem::create_directory(TString::Format("output_files/analysis/%s", folder).Data()); TString name = TString::Format("%s_canvas", histogram->GetName()); TCanvas *c = new TCanvas(name, histogram->GetName(), 0, 0, 800, 600); histogram->SetStats(0); histogram->Draw(); TLine* line = new TLine(cut_value, 0, cut_value, histogram->GetMaximum()); line->SetLineColor(kRed); line->SetLineStyle(kDashed); line->Draw(); c->Draw(); c->SaveAs(TString::Format("output_files/analysis/%s/%s.pdf", folder, name.Data()).Data()); } #endif