// EWP-BplusToKstMuMu-AngAna/Code/Selection/PlotTMVA.cpp
// (repository web-view header: 665 lines, 20 KiB, C++)

//Functions to make pretty plots from ROOT TMVA training output
//Renata Kopecna
#include "GlobalFunctions.hh"
#include "Design.hpp"
#include "Paths.hpp"
#include "MVAclass.hpp"
#include <sys/stat.h> //mkdir
using namespace std;
// Opens the TMVA training output file belonging to the given configuration.
// The file name is resolved through GetBDTConfigFile() from all the arguments.
// Returns the TFile pointer (never NULL). When the file could not be opened an
// error is reported and the returned object is a zombie, i.e. IsZombie() is true;
// every Get() on it yields NULL, so callers should check the zombie state.
TFile *GetTMVAfile(int year, int Run, bool SplitYears, bool KShortDecaysInVelo, int nConfiguration, bool UseLowQ2Range, string customTMbranch, bool gammaTM){
    //Resolve the file name once instead of once per message
    const string fileName = GetBDTConfigFile(SplitYears,year,Run,KShortDecaysInVelo,nConfiguration,UseLowQ2Range, customTMbranch, gammaTM);
    TFile *file = new TFile(fileName.c_str(),"OPEN");
    //operator new does not return NULL; a failed open is flagged by the zombie bit instead
    if (file == NULL || file->IsZombie()){
        coutERROR("Cannot read input file " + fileName + "!");
        return file;
    }
    std::cout << "Reading file '" << fileName << "'" << std::endl;
    return file;
}
// Looks up the object "CorrelationMatrix<type>" in the given file.
// type is appended verbatim to the object name; the wrappers in this file use "S" and "B".
// Returns the histogram, or NULL after calling coutERROR when the lookup fails.
TH2F *GetCorrelationMatrix(TFile *file, string type){ //type = S,B
    const string objectName = "CorrelationMatrix" + type;
    TH2F *matrix = static_cast<TH2F*>(file->Get(objectName.c_str()));
    if (!matrix){
        coutERROR("Couldn't get correlation matrix for " + type + "!");
    }
    return matrix;
}
// Shortcut for GetCorrelationMatrix() with type "S".
TH2F *GetCorrelationMatrixSignal(TFile *file){
    const string signalTag = "S";
    return GetCorrelationMatrix(file, signalTag);
}
// Shortcut for GetCorrelationMatrix() with type "B".
TH2F *GetCorrelationMatrixBackground(TFile *file){
    const string backgroundTag = "B";
    return GetCorrelationMatrix(file, backgroundTag);
}
// Returns the name used for the "Method_<...>" parent folder of a method:
// every method name containing "BDT" maps to "BDT", any other name maps to itself.
string oberFolder(string method){
    const bool containsBDT = (method.find("BDT") != std::string::npos);
    if (containsBDT){
        return "BDT";
    }
    return method;
}
// Reads the histogram "Method_<folder>/<method>/MVA_<method>_<type>" from the file,
// where <folder> is given by oberFolder(method).
// The wrappers in this file pass type = S, B, Train_S, Train_B.
// Returns the histogram, or NULL after calling coutERROR when the lookup fails.
TH1F *GetTMVAresponse(TFile *file, string method, string type){ //type = S,B, Train_S, Train_B
    const string methodDir = "Method_" + oberFolder(method) + "/" + method;
    const string histPath = methodDir + "/MVA_" + method + "_" + type;
    TH1F *response = static_cast<TH1F*>(file->Get(histPath.c_str()));
    if (!response){
        coutERROR("Couldn't get TMVA response for " + method + " and type " + type + "!");
    }
    return response;
}
// Shortcut for GetTMVAresponse() with type "S".
TH1F *GetTMVAresponseSignal(TFile *file, string method){
    const string responseType = "S";
    return GetTMVAresponse(file, method, responseType);
}
// Shortcut for GetTMVAresponse() with type "B".
TH1F *GetTMVAresponseBackground(TFile *file, string method){
    const string responseType = "B";
    return GetTMVAresponse(file, method, responseType);
}
// Shortcut for GetTMVAresponse() with type "Train_S".
TH1F *GetTMVAresponseTrainingSignal(TFile *file, string method){
    const string responseType = "Train_S";
    return GetTMVAresponse(file, method, responseType);
}
// Shortcut for GetTMVAresponse() with type "Train_B".
TH1F *GetTMVAresponseTrainingBackground(TFile *file, string method){
    const string responseType = "Train_B";
    return GetTMVAresponse(file, method, responseType);
}
// Reads "InputVariables_Id/<variable>__Signal_Id" from the file.
// Returns the histogram, or NULL after calling coutERROR when the lookup fails.
TH1F *GetVariableSignal(TFile *file, string variable){
    const string histPath = "InputVariables_Id/" + variable + "__Signal_Id";
    TH1F *signalHist = static_cast<TH1F*>(file->Get(histPath.c_str()));
    if (!signalHist){
        coutERROR("Couldn't get " + variable + "_S distribution!");
    }
    return signalHist;
}
// Reads "InputVariables_Id/<variable>__Background_Id" from the file.
// Returns the histogram, or NULL after calling coutERROR when the lookup fails.
TH1F *GetVariableBackground(TFile *file, string variable){
    const string histPath = "InputVariables_Id/" + variable + "__Background_Id";
    TH1F *backgroundHist = static_cast<TH1F*>(file->Get(histPath.c_str()));
    if (!backgroundHist){
        coutERROR("Couldn't get " + variable + "_B distribution!");
    }
    return backgroundHist;
}
// Reads "Method_<folder>/<method>/MVA_<method>_effS" from the file,
// where <folder> is given by oberFolder(method).
// Returns the histogram, or NULL after calling coutERROR when the lookup fails.
TH1F *GetEfficiencySignal(TFile *file, string method){
    const string methodDir = "Method_" + oberFolder(method) + "/" + method;
    const string histPath = methodDir + "/MVA_" + method + "_effS";
    TH1F *efficiency = static_cast<TH1F*>(file->Get(histPath.c_str()));
    if (!efficiency){
        coutERROR("Couldn't get signal efficiency for " + method + "!");
    }
    return efficiency;
}
// Reads "Method_<folder>/<method>/MVA_<method>_effB" from the file,
// where <folder> is given by oberFolder(method).
// Returns the histogram, or NULL after calling coutERROR when the lookup fails.
TH1F *GetEfficiencyBackground(TFile *file, string method){
    const string methodDir = "Method_" + oberFolder(method) + "/" + method;
    const string histPath = methodDir + "/MVA_" + method + "_effB";
    TH1F *efficiency = static_cast<TH1F*>(file->Get(histPath.c_str()));
    if (!efficiency){
        coutERROR("Couldn't get background efficiency for " + method + "!");
    }
    return efficiency;
}
// Reads "Method_<folder>/<method>/MVA_<method>_rejBvsS" (used as the ROC curve) from the file,
// where <folder> is given by oberFolder(method).
// Returns the histogram, or NULL after calling coutERROR when the lookup fails.
TH1F *GetROC(TFile *file, string method){
    const string methodDir = "Method_" + oberFolder(method) + "/" + method;
    const string histPath = methodDir + "/MVA_" + method + "_rejBvsS";
    TH1F *rocCurve = static_cast<TH1F*>(file->Get(histPath.c_str()));
    if (!rocCurve){
        coutERROR("Couldn't get ROC curve for " + method + "!");
    }
    return rocCurve;
}
// Returns the object named "TrainTree" from the file (NULL if the lookup fails; no message is printed).
TTree *GetTrainTree(TFile *file){
    return static_cast<TTree*>(file->Get("TrainTree"));
}
// Returns the object named "TestTree" from the file (NULL if the lookup fails; no message is printed).
TTree *GetTestTree(TFile *file){
    return static_cast<TTree*>(file->Get("TestTree"));
}
// Returns the number of entries of the "TrainTree" stored in the file.
// When the tree cannot be found an error is reported and 0 is returned
// (the unguarded version dereferenced the NULL pointer).
int GetTrainEvents(TFile *file){
    TTree *t_train = static_cast<TTree*>(file->Get("TrainTree"));
    if (t_train == NULL){
        coutERROR("Couldn't get TrainTree!");
        return 0;
    }
    const int n = static_cast<int>(t_train->GetEntries());
    delete t_train;
    return n;
}
// Returns the number of "TrainTree" entries passing the selection classID==0
// (the entries counted as signal here), as reported by TTree::Draw.
// When the tree cannot be found an error is reported and 0 is returned
// (the unguarded version dereferenced the NULL pointer).
int GetTrainSignalEvents(TFile *file){
    TTree *t_train = static_cast<TTree*>(file->Get("TrainTree"));
    if (t_train == NULL){
        coutERROR("Couldn't get TrainTree!");
        return 0;
    }
    //Draw returns the number of selected entries
    const int nSig = static_cast<int>(t_train->Draw("classID","classID==0"));
    delete t_train;
    coutDebug("nSig = " + to_string(nSig));
    return nSig;
}
// Returns the number of "TrainTree" entries passing the selection classID==1
// (the entries counted as background here), as reported by TTree::Draw.
// When the tree cannot be found an error is reported and 0 is returned
// (the unguarded version dereferenced the NULL pointer).
int GetTrainBackgroundEvents(TFile *file){
    TTree *t_train = static_cast<TTree*>(file->Get("TrainTree"));
    if (t_train == NULL){
        coutERROR("Couldn't get TrainTree!");
        return 0;
    }
    //Draw returns the number of selected entries
    const int nBkg = static_cast<int>(t_train->Draw("classID","classID==1"));
    delete t_train;
    coutDebug("nBkg = " + to_string(nBkg));
    return nBkg;
}
//Make plots from file
// Draws the correlation matrix of the given type (S or B) and stores it as
// <savePath>Correlation<type>.eps and, with the extension swapped, as .root.
// Returns false when the matrix is not available in the file, true otherwise.
bool SaveCorrelationPlot(TFile *file, string type, string savePath){
    TH2F *matrix = GetCorrelationMatrix(file, type); //S or B
    if (!matrix) return false;

    DesignCorrelationPlots(matrix);
    TCanvas *canvas = c_Correlation(type);
    canvas->cd();
    matrix->Draw("COLZTEXT");

    //Same plot in both formats: .eps first, then .root
    string outputName = savePath + "Correlation" + type + ".eps";
    canvas->SaveAs(outputName.c_str());
    replace(outputName,".eps",".root");
    canvas->SaveAs(outputName.c_str());

    //Clean up the histogram and the canvas
    matrix->Clear();
    delete matrix;
    canvas->Clear();
    delete canvas;
    return true;
}
//Load all variable names from MVA reader
// Builds an MVA_variables object for the given DL tag and returns its reader names.
// A name containing ':' is cut right before the first ':'; other names are copied unchanged.
vector <string> v_variables(string DL="") {
    //Automatic storage: the object is released on return (the heap allocation used before was never deleted)
    MVA_variables InputVariables(DL);
    vector <string> v_var;
    for (auto & var : InputVariables.GetAllReaderNames()){
        //find() yields npos when there is no ':', and substr(0,npos) is the whole string
        v_var.push_back(var.substr(0, var.find(":")));
    }
    return v_var;
}
// Plots, for every variable returned by v_variables(DL), the normalized signal and
// background distributions on one canvas and stores them as
// <savePath>variables/<variable>.eps and .root.
// Returns false when the output folder cannot be created, when the variable list is
// empty, or when at least one variable had no signal/background histogram in the file
// (such variables are skipped); true otherwise.
bool SaveVariablesSignalVsBackground(TFile *file, string savePath, string DL=""){
    //Check if folder for the plots exists and if not, create one
    string folder_path = savePath + "variables/";
    struct stat st;
    if (stat(folder_path.c_str(),&st)!=0 && mkdir(folder_path.c_str(), 0755)==-1){
        coutERROR("Folder "+folder_path+" couldn't be created!");
        return false;
    }
    //get vector of variables
    vector <string> v_var = v_variables(DL);
    if (v_var.empty()){
        coutERROR("Variable vector is empty!");
        return false;
    }
    bool allSaved = true;
    //loop over variables
    for (auto& var : v_var){
        //Get signal and background histograms
        TH1F *h_variableS = GetVariableSignal(file,var);
        TH1F *h_variableB = GetVariableBackground(file,var);
        //The getters already reported the problem; skip instead of dereferencing NULL
        if (h_variableS == NULL || h_variableB == NULL){
            delete h_variableS;
            delete h_variableB;
            allSaved = false;
            continue;
        }
        //Create a TCanvas
        TCanvas *c_variable = c_VariablesSignalVsBackground(var);
        c_variable->cd();
        //normalize them (entries times bin width); an empty histogram is left unscaled
        const double normS = h_variableS->GetEntries()*h_variableS->GetXaxis()->GetBinWidth(3);
        const double normB = h_variableB->GetEntries()*h_variableB->GetXaxis()->GetBinWidth(3);
        if (normS > 0) h_variableS->Scale(1.0/normS);
        if (normB > 0) h_variableB->Scale(1.0/normB);
        //Make it pretty
        designVariablesSignalVsBackground(h_variableS,h_variableB);
        //Plot it
        h_variableS->Draw("HIST");
        h_variableB->Draw("HIST SAME");
        //Save it
        string path = folder_path+var+".eps";
        c_variable->SaveAs(path.c_str());
        replace(path,".eps",".root");
        c_variable->SaveAs(path.c_str());
        //Delete it
        h_variableS->Clear();
        h_variableB->Clear();
        delete h_variableS;
        delete h_variableB;
        c_variable->Clear();
        delete c_variable;
    }
    return allSaved;
}
// Draws the signal and background MVA response of one method on a common canvas and
// stores it as <savePath><method>_Response.eps and .root.
// method must be one of BDT, BDTG, MLP.
// Returns false for an invalid method or when a response histogram is missing in the
// file, true otherwise.
bool SaveMVAResponse(TFile *file, string savePath, string method){
    //Check for valid method
    if (method != "BDT" && method !="BDTG" && method != "MLP"){
        coutERROR("Wrong method used in SaveMVAResponse! Choose from [BDT, BDTG, MLP]!");
        return false;
    }
    //Get signal and background histograms
    TH1F *h_responseS = GetTMVAresponseSignal(file,method);
    TH1F *h_responseB = GetTMVAresponseBackground(file,method);
    //The getters already reported the problem; bail out instead of dereferencing NULL
    if (h_responseS == NULL || h_responseB == NULL){
        delete h_responseS;
        delete h_responseB;
        return false;
    }
    //Create a TCanvas
    string c_name = method + "_response";
    TCanvas *c_response = c_VariablesSignalVsBackground(c_name.c_str());
    c_response->cd();
    //Set log scale if not BDT
    if (method != "BDT") c_response->SetLogy();
    //Make it pretty
    designResponseSignalVsBackground(h_responseS,h_responseB, method);
    //Plot it
    h_responseS->Draw("BAR");
    h_responseB->Draw("BAR SAME");
    //Add legend
    TLegend *leg = new TLegend(0.13,0.9,0.4,0.79);
    leg->AddEntry(h_responseS, "Signal","f");
    leg->AddEntry(h_responseB, "Background","f");
    leg->Draw("SAME");
    //Save it
    string path = savePath+method+"_Response.eps";
    c_response->SaveAs(path.c_str());
    replace(path,".eps",".root");
    c_response->SaveAs(path.c_str());
    //Delete it; the legend goes first because it refers to the histograms
    //(it was previously never released)
    delete leg;
    h_responseS->Clear();
    h_responseB->Clear();
    delete h_responseS;
    delete h_responseB;
    c_response->Clear();
    delete c_response;
    return true;
}
// Draws the overtraining check of one method: the response histograms of types S/B
// (labelled "test" in the legend) as filled histograms and those of types
// Train_S/Train_B (labelled "training") as points, stored as
// <savePath><method>_Overtraining.eps and .root.
// method must be one of BDT, BDTG, MLP.
// Returns false for an invalid method or when one of the four histograms is missing
// in the file, true otherwise.
bool SaveMVAOvertraining(TFile *file, string savePath, string method){
    //Check for valid method
    if (method != "BDT" && method !="BDTG" && method != "MLP"){
        coutERROR("Wrong method used in SaveMVAOvertraining! Choose from [BDT, BDTG, MLP]!");
        return false;
    }
    //Get signal and background histograms
    TH1F *h_overtrainS = GetTMVAresponseSignal(file,method);
    TH1F *h_overtrainB = GetTMVAresponseBackground(file,method);
    //Get signal and background histograms from training
    TH1F *h_overtrainTrainS = GetTMVAresponseTrainingSignal(file,method);
    TH1F *h_overtrainTrainB = GetTMVAresponseTrainingBackground(file,method);
    //The getters already reported the problem; bail out instead of dereferencing NULL
    if (h_overtrainS == NULL || h_overtrainB == NULL || h_overtrainTrainS == NULL || h_overtrainTrainB == NULL){
        delete h_overtrainS;
        delete h_overtrainB;
        delete h_overtrainTrainS;
        delete h_overtrainTrainB;
        return false;
    }
    //Create a TCanvas
    string c_name = method + "_overtrain";
    TCanvas *c_overtrain = c_VariablesSignalVsBackground(c_name.c_str());
    c_overtrain->cd();
    //Set log scale if not BDT
    if (method != "BDT") c_overtrain->SetLogy();
    //Make it pretty
    designResponseSignalVsBackground(h_overtrainS,h_overtrainB, method);
    designOvertraining(h_overtrainTrainS,h_overtrainTrainB);
    //Plot it
    h_overtrainS->Draw("][ HIST");
    h_overtrainB->Draw("][ HIST SAME");
    h_overtrainTrainS->Draw("PSAME");
    h_overtrainTrainB->Draw("PSAME");
    //Add legends
    TLegend *leg = new TLegend(0.2,0.93,0.55,0.82);
    leg->AddEntry(h_overtrainS, "Signal (test)","f");
    leg->AddEntry(h_overtrainB, "Background (test)","f");
    leg->Draw("SAME");
    TLegend *legTrain = new TLegend(0.55,0.93,0.89,0.82);
    legTrain->AddEntry(h_overtrainTrainS, "Signal (training)","lep");
    legTrain->AddEntry(h_overtrainTrainB, "Background (training)","lep");
    legTrain->Draw("SAME");
    //Save it
    string path = savePath+method+"_Overtraining.eps";
    c_overtrain->SaveAs(path.c_str());
    replace(path,".eps",".root");
    c_overtrain->SaveAs(path.c_str());
    //Delete it; the legends go first because they refer to the histograms
    //(they were previously never released)
    delete leg;
    delete legTrain;
    h_overtrainS->Clear();
    h_overtrainB->Clear();
    h_overtrainTrainS->Clear();
    h_overtrainTrainB->Clear();
    delete h_overtrainS;
    delete h_overtrainB;
    delete h_overtrainTrainS;
    delete h_overtrainTrainB;
    c_overtrain->Clear();
    delete c_overtrain;
    return true;
}
// Draws signal efficiency, background efficiency, purity and significance of one
// method as a function of the cut value and stores the canvas as
// <savePath><method>_Efficiency.eps and .root.
//   purity       = nSig*effS / (nSig*effS + nBkg*effB)
//   significance = nSig*effS / sqrt(nSig*effS + nBkg*effB)
// nSig, nBkg: numbers of signal and background events used to weight the efficiencies.
// Returns false when an efficiency histogram is missing in the file, true otherwise.
bool SaveEfficiency(TFile *file, string savePath, string method, int nSig, int nBkg){
    //Get signal+background efficiency
    TH1F *h_efficiencyS = GetEfficiencySignal(file,method);
    TH1F *h_efficiencyB = GetEfficiencyBackground(file,method);
    //The getters already reported the problem; bail out instead of dereferencing NULL
    if (h_efficiencyS == NULL || h_efficiencyB == NULL){
        delete h_efficiencyS;
        delete h_efficiencyB;
        return false;
    }
    //Create a TCanvas
    string c_name = method + "_efficiency";
    TCanvas *c_eff = c_Efficiency(c_name.c_str());
    c_eff->cd();
    //Create two pads (to get significance y-axis on the right)
    TPad *pad1 = new TPad("pad1","",0,0,1,1);
    TPad *pad2 = new TPad("pad2","",0,0,1,1);
    pad2->SetFillStyle(4000); //will be transparent
    pad2->SetFrameFillStyle(0);
    pad1->SetRightMargin(0.16);
    pad2->SetRightMargin(0.16);
    pad1->Draw();
    pad1->cd();
    //Calculate purity
    TH1F *h_purity = (TH1F*)h_efficiencyS->Clone();
    h_purity->Scale(nSig);
    TH1F *h_SplusB = (TH1F*)h_efficiencyS->Clone();
    h_SplusB->Scale(nSig);
    h_SplusB->Add(h_efficiencyB,nBkg);
    h_purity->Divide(h_SplusB);
    //Calculate significance
    TH1F *h_significance = (TH1F*)h_efficiencyS->Clone();
    h_significance->Scale(nSig);
    //Regular ROOT bins are numbered 1..N (0 = underflow, N+1 = overflow). The previous
    //loop over 0..N-1 left bin N as S+B instead of sqrt(S+B); cover every cell.
    const int lastCell = h_SplusB->GetNbinsX() + 1;
    for (int b = 0; b <= lastCell; b++){
        h_SplusB->SetBinContent(b,TMath::Sqrt(h_SplusB->GetBinContent(b)));
    }
    h_significance->Divide(h_SplusB);
    //Make it pretty
    designEfficiency(h_efficiencyS,h_efficiencyB,h_purity,h_significance, method);
    //Add legends
    TLegend *leg = new TLegend(0.15,0.93,0.52,0.82);
    leg->AddEntry(h_efficiencyS, "Signal efficiency","l");
    leg->AddEntry(h_efficiencyB, "Background efficiency","l");
    TLegend *legSignificance = new TLegend(0.52,0.93,0.8,0.82);
    legSignificance->AddEntry(h_purity, "Purity","l");
    legSignificance->AddEntry(h_significance, "Significance","l");
    //Calculate highest significance and the cut value (bin center) where it is reached
    double maxSignificance = h_significance->GetMaximum();
    double maxSignificanceMLPcut = h_significance->GetBinCenter(h_significance->GetMaximumBin());
    double xmin = h_efficiencyS->GetBinLowEdge(1);
    //NOTE(review): ownership of the returned TPaveText is not visible here, so it is not deleted below
    TPaveText *text = significanceText(xmin,nSig, nBkg, maxSignificance, maxSignificanceMLPcut);
    //Plot it
    pad1->cd();
    pad1->SetGridy();
    pad1->SetGridx();
    h_efficiencyS->Draw("SAME");
    h_efficiencyB->Draw("SAME");
    h_purity->Draw("SAME");
    leg->Draw("SAME");
    text->Draw("SAME");
    pad1->Update();
    c_eff->cd();
    pad2->Draw();
    pad2->cd();
    h_significance->Draw("Y+");
    h_significance->Draw("SAME");
    legSignificance->Draw("SAME");
    pad2->Update();
    c_eff->cd();
    //Save it
    string path = savePath+method+"_Efficiency.eps";
    c_eff->SaveAs(path.c_str());
    replace(path,".eps",".root");
    c_eff->SaveAs(path.c_str());
    //Delete it; the legends go first because they refer to the histograms.
    //The legends and the S+B helper histogram were previously never released.
    delete leg;
    delete legSignificance;
    h_efficiencyS->Clear();
    h_efficiencyB->Clear();
    h_purity->Clear();
    h_significance->Clear();
    delete h_efficiencyS;
    delete h_efficiencyB;
    delete h_purity;
    delete h_significance;
    delete h_SplusB;
    c_eff->Clear();
    delete c_eff;
    return true;
}
// Returns the histogram title without its first four characters
// (the "MVA_" prefix of the ROC histogram titles).
string getROCmethod(TH1F *h_ROC){
    string methodName = h_ROC->GetTitle();
    const size_t prefixLength = 4; //length of "MVA_"
    methodName.erase(0, prefixLength);
    return methodName;
}
// Draws the ROC curves of all given methods from one file on a common canvas and
// stores it as <savePath>ROCs.eps and .root.
// Returns false as soon as one method has no ROC histogram in the file, true otherwise.
bool SaveROCs(TFile *file, string savePath, vector <string> methods){
    //Get all the histograms first, so that nothing has to be cleaned up on failure
    vector <TH1F*> v_h_ROC;
    for (auto & method:methods){
        //Fetch once and reuse the pointer
        TH1F *h_method = GetROC(file,method);
        if (h_method == NULL){ //Check if histogram exists
            coutERROR("Method " +method + " is not in the MVA output file!");
            return false;
        }
        v_h_ROC.push_back(h_method);
    }
    //Create a TCanvas
    TCanvas *c_ROC = c_ROCplot("ROCs");
    c_ROC->cd();
    //add Legend
    TLegend *leg = new TLegend(0.25,0.25,0.5,0.35);
    //make it pretty and draw it
    for_indexed(auto & h_ROC : v_h_ROC){
        leg->AddEntry(h_ROC,getROCmethod(h_ROC).c_str(),"l");
        designROC(h_ROC,i);
        if (i>0) h_ROC->Draw("C SAME");
        else h_ROC->Draw("C");
    }
    leg->SetBorderSize(0);
    leg->SetFillStyle(0);
    leg->Draw();
    //Save it
    string path = savePath+"ROCs.eps";
    c_ROC->SaveAs(path.c_str());
    replace(path,".eps",".root");
    c_ROC->SaveAs(path.c_str());
    //Delete it; the histograms themselves stay with the file
    //(the legend was previously never released)
    delete leg;
    v_h_ROC.clear();
    c_ROC->Clear();
    delete c_ROC;
    return true;
}
// Draws the ROC curve of one method taken from several files on a common canvas and
// stores it as <savePath>multipleROCs.eps and .root. The legend labels the curves
// <method>_<index>, the index being the position of the file in the vector.
// Returns false when a file pointer is NULL or a file has no ROC histogram for the
// method, true otherwise.
bool SaveMultipleROCS(vector <TFile *> files, string savePath, string method){
    //Get all the histograms first, so that nothing has to be cleaned up on failure
    vector <TH1F*> v_h_ROC;
    for (auto & file:files){
        if (file == NULL){
            coutERROR("Invalid (NULL) MVA output file!");
            return false;
        }
        //Fetch once and reuse the pointer
        TH1F *h_file = GetROC(file,method);
        if (h_file == NULL){ //Check if histogram exists
            coutERROR("Method " +method + " is not in the MVA output file " + file->GetPath() + "!");
            return false;
        }
        v_h_ROC.push_back(h_file);
    }
    //Create a TCanvas
    TCanvas *c_ROC = c_ROCplot("ROCs");
    c_ROC->cd();
    //add Legend
    TLegend *leg = new TLegend(0.22,0.22,0.5,0.38);
    //make it pretty and draw it
    for_indexed(auto & h_ROC : v_h_ROC){
        leg->AddEntry(h_ROC,Form("%s_%i",method.c_str(),((int) i)),"l");
        designROC(h_ROC,i);
        if (i>0) h_ROC->Draw("C SAME");
        else h_ROC->Draw("C");
    }
    leg->SetBorderSize(0);
    leg->SetFillStyle(0);
    leg->Draw();
    //Save it
    string path = savePath+"multipleROCs.eps";
    c_ROC->SaveAs(path.c_str());
    replace(path,".eps",".root");
    c_ROC->SaveAs(path.c_str());
    //Delete it; the histograms themselves stay with their files
    //(the legend was previously never released)
    delete leg;
    v_h_ROC.clear();
    c_ROC->Clear();
    delete c_ROC;
    return true;
}
// Fills nSig and nBkg with the hardcoded numbers of signal and background events
// for the requested sample (no better way to obtain the totals has been found yet).
// Only the non-split-years samples of Run 1 and Run 2 are filled in; Run 12 and the
// split-years case are still "TO FILL" and produce a warning with both numbers at zero.
// For any other Run value (non-split years) the function returns silently with zeros.
// year, nConfiguration and UseLowQ2Range are currently not used.
void nEvents(int year, int Run, bool SplitYears, bool KshortDecaysInVelo,int nConfiguration,bool UseLowQ2Range, int& nSig, int& nBkg){
    int sigYield = 0;
    int bkgYield = 0;
    nSig = 0;
    nBkg = 0;
    if (!SplitYears){
        switch (Run){
        case 1:
            if (!KshortChannel){
                sigYield = 67;
                bkgYield = 14;
            }
            else if (KshortDecaysInVelo){ //Run1 LL
                sigYield = 97;
                bkgYield = 451;
            }
            else{ //Run1 DD
                sigYield = 158;
                bkgYield = 2035;
            }
            break;
        case 2:
            if (!KshortChannel){
                sigYield = 297;
                bkgYield = 203;
            }
            else if (KshortDecaysInVelo){ //Run2 LL
                sigYield = 291;
                bkgYield = 1016;
            }
            else{ //Run2 DD
                sigYield = 492;
                bkgYield = 4813;
            }
            break;
        case 12:
            //TO FILL
            break;
        default:
            //Unknown run: outputs stay at zero and no message is printed
            return;
        }
    }
    //else: Split Years, TO FILL

    if (sigYield==0 && bkgYield==0){
        coutWarning("No event numbers have been assigned! Please fill in the hardcoded values in Test.cpp");
    }
    else{
        cout << "[INFO]\t\t nSig=" << sigYield << " nBkg=" << bkgYield << endl;
    }
    nSig = sigYield;
    nBkg = bkgYield;
}
bool SaveAllFromOneFile(int year, int Run, bool SplitYears, bool KshortDecaysInVelo,int nConfiguration, bool UseLowQ2Range, string customTMbranch, bool gammaTM){
vector <string> methods;
if (KshortChannel) methods = {"BDT","BDTG"};
else methods = {"BDT","BDTG","MLP"};
TFile *file= GetTMVAfile(year, Run, SplitYears, KshortDecaysInVelo, nConfiguration, UseLowQ2Range, customTMbranch, gammaTM);
if (file == NULL) return 0;
//Check if folder for the plots exists and if not, create one
string folder_path = GetTMVAplotsFolder(year, Run, SplitYears, KshortDecaysInVelo,nConfiguration,UseLowQ2Range, customTMbranch, gammaTM);
//First, try through Unix command (easier than trying to make a workaround in c++)
string command = "mkdir -p " + folder_path;
system(command.c_str());
//Now check if it has been created indeed
struct stat st;
if (stat(folder_path.c_str(),&st)!=0){
coutERROR("Folder "+folder_path+" couldn't be created!");
return 0;
}
//Save correlations plots
if (!SaveCorrelationPlot(file,"S",folder_path)) return 0;
if (!SaveCorrelationPlot(file,"B",folder_path)) return 0;
//Save all variables plots
SaveVariablesSignalVsBackground(file, folder_path, (KshortChannel ? (KshortDecaysInVelo ? "LL" : "DD") : ""));
int nSig = 0;
int nBkg = 0;
nEvents(year, Run, SplitYears,KshortDecaysInVelo,nConfiguration,UseLowQ2Range, nSig, nBkg);
for (auto &method : methods){
//Save overtraining
if (!SaveMVAOvertraining(file,folder_path,method)) return 0;
//Save efficiency
if (!SaveEfficiency(file,folder_path,method,nSig, nBkg)) return 0;
}
//Save ROCs
if (!SaveROCs(file, folder_path,methods)) return 0;
return 1;
}
bool SaveMultipleROCS(int Run, int nLow, int nHigh){
vector <TFile*> testFiles;
for (int f = nLow; f < nHigh; f++){
testFiles.push_back(GetTMVAfile(2011,Run,false,false,f,false,"",true));
}
string path = GetTMVAplotsFolder(2011,Run,false,false,0,false);
replace(path,"Config0/","");
coutDebug(path);
return (SaveMultipleROCS(testFiles,path,"MLP"));
}