In [1]:
import uproot
import awkward as ak
import numpy as np
# Read every branch of the "Selected" tree into one awkward record array.
# NOTE(review): absolute, machine-specific path; the notebook only runs where this file exists.
input_tree = uproot.open({"/work/guenther/reco_tuner/data/param_data_selected.root": "Selected"})
array = input_tree.arrays()

# Slope differences between the "_l11" state and the reference state, signed and absolute.
# NOTE(review): "_l11" presumably denotes a specific detector layer; confirm with the ntuple producer.
slope_x_change = array["tx_l11"] - array["tx"]
slope_y_change = array["ty_l11"] - array["ty"]
array["dSlope_xEndT"] = slope_x_change
array["dSlope_yEndT"] = slope_y_change
array["dSlope_xEndT_abs"] = np.abs(slope_x_change)
array["dSlope_yEndT_abs"] = np.abs(slope_y_change)

# Straight-line extrapolation of y to z = 9410 from the reference state and from
# the "_l11" state; the stored difference is the regression target used later.
# NOTE(review): 9410. is presumably a reference z position in the ntuple's length unit; confirm.
y_straight_end_t = array["y"] + array["ty"] * (9410. - array["z"])
y_l11_end_t = array["y_l11"] + array["ty_l11"] * (9410. - array["z_l11"])
array["yStraightEndT"] = y_straight_end_t
array["yDiffEndT"] = y_l11_end_t - y_straight_end_t

def format_array(name, coef):
    """Render coefficients as a one-line C++ ``constexpr std::array`` definition.

    Args:
        name: Identifier used for the generated C++ array.
        coef: Iterable of numeric coefficients.

    Returns:
        A string of the form ``constexpr std::array <name>{<c0>f, <c1>f, ...};``.

    Note:
        Coefficients equal to zero are omitted, so positions in the emitted
        array only line up with the input when no coefficient is exactly zero.
    """
    literals = []
    for c in coef:
        if c == 0.0:
            continue
        text = str(c)
        # An integer-valued input such as 1 would otherwise be emitted as "1f",
        # which is not a valid C++ literal (the f suffix needs a floating literal).
        if text.lstrip("+-").isdigit():
            text += ".0"
        literals.append(text + "f")
    code = f"constexpr std::array {name}"
    code += "{" + ", ".join(literals) + "};"
    return code

In [89]:
from sklearn.preprocessing import PolynomialFeatures
from sklearn.linear_model import LinearRegression, Lasso, Ridge
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.metrics import mean_squared_error

# Input columns of `array` that feed the polynomial expansion. The order matters:
# it determines the order of the generated feature names and of the coefficients.
features = [
    "ty",
    "tx",
    "dSlope_xEndT",
    "dSlope_yEndT",
    "dSlope_xEndT_abs",
    "dSlope_yEndT_abs",
]
# Column the regression is trained to predict.
target_feat = "yDiffEndT"

# One row per entry, one column per feature, as plain numpy arrays for scikit-learn.
data = np.column_stack([ak.to_numpy(array[feat]) for feat in features])
target = ak.to_numpy(array[target_feat])
# Fixed random_state keeps the 80/20 split reproducible between runs.
X_train, X_test, y_train, y_test = train_test_split(data, target, test_size=0.2, random_state=42)

# All monomials of the inputs up to total degree 6, without the constant column.
poly = PolynomialFeatures(degree=6, include_bias=False)
X_train_model = poly.fit_transform(X_train)
# The transformer is fitted on the training split only; held-out data is just transformed.
X_test_model = poly.transform(X_test)
poly_features = poly.get_feature_names_out(input_features=features)
# Polynomial terms that enter the reduced fit. Each entry has to match a name
# returned by PolynomialFeatures.get_feature_names_out exactly (factors separated
# by a single space, powers written as "^n"). Commented-out entries are inactive
# candidates, presumably terms that were tried and dropped.
keep = [
    # "dSlope_xEndT",
    "dSlope_yEndT",
    # "dSlope_yEndT_abs",
    # "ty dSlope_xEndT",
    # "ty dSlope_yEndT",
    "ty dSlope_xEndT_abs",
    "ty dSlope_yEndT_abs",
    "ty dSlope_yEndT^2",
    "ty dSlope_xEndT^2",
    # "tx dSlope_xEndT",
    # "tx dSlope_xEndT_abs",
    # "tx dSlope_yEndT",
    "ty tx dSlope_xEndT",
    "tx^2 dSlope_yEndT",
    # "ty^2 dSlope_xEndT",
    # "ty^2 dSlope_yEndT",
    # "ty^2 dSlope_xEndT_abs",
    # "ty^2 tx dSlope_xEndT",
    # "ty tx^2 dSlope_yEndT",
    "ty tx^2 dSlope_xEndT_abs",
    "ty^3 tx dSlope_xEndT",
    # "ty tx^3 dSlope_xEndT",
    # "ty^3 dSlope_yEndT_abs",
]

# Terms already classified as not wanted. In this cell the list is only used to
# hide these names from the printed overview of large coefficients.
do_not_keep = [
    "dSlope_xEndT",
    "dSlope_yEndT_abs",
    "ty dSlope_xEndT",
    "tx dSlope_xEndT",
    "tx dSlope_xEndT_abs",
    "tx dSlope_yEndT",
    "ty^2 dSlope_xEndT",
    "ty^3 dSlope_yEndT_abs",
    "ty tx dSlope_yEndT",
    "ty tx^3 dSlope_xEndT",
    "ty tx^2 dSlope_yEndT",
]
# reduce=True : fit only the hand-picked terms in `keep` with ordinary least squares.
# reduce=False: fit every term that is linear in exactly one dSlope_* input with a
#               weakly regularised Lasso, to look for further useful terms.
reduce = True
if reduce:
    # An empty `keep` list removes nothing, i.e. all polynomial terms are fitted.
    remove = [i for i, f in enumerate(poly_features) if (keep and f not in keep)]
    lin_reg = LinearRegression(fit_intercept=False)
else:
    # Drop terms without any dSlope_* factor, with a dSlope_* factor raised to a
    # power, or with a product of two dSlope_* factors.
    excluded_patterns = ("EndT^", "abs^", "EndT dSlope", "abs dSlope")
    remove = [
        i
        for i, f in enumerate(poly_features)
        if ("dSlope_" not in f) or any(pattern in f for pattern in excluded_patterns)
    ]
    lin_reg = Lasso(fit_intercept=False, alpha=0.000001)

# Apply the same column selection to both design matrices and to the name list
# so that coefficient i always belongs to poly_features[i].
X_train_model = np.delete(X_train_model, remove, axis=1)
X_test_model = np.delete(X_test_model, remove, axis=1)
poly_features = np.delete(poly_features, remove)
if reduce:
    print(poly_features)

lin_reg.fit(X_train_model, y_train)
y_pred_test = lin_reg.predict(X_test_model)
print("intercept=", lin_reg.intercept_)
# Overview of large coefficients whose term is in neither list yet; with
# reduce=True and a non-empty `keep` this is always empty.
print("coef=", {k: v for k, v in zip(poly_features, lin_reg.coef_) if abs(v) > 1.0 and k not in keep and k not in do_not_keep})
print("r2 score=", lin_reg.score(X_test_model, y_test))
# Take the square root explicitly: the `squared=False` keyword of
# mean_squared_error was deprecated in scikit-learn 1.4 and removed in 1.6.
print("RMSE =", np.sqrt(mean_squared_error(y_test, y_pred_test)))
# Baseline without any correction: straight-line extrapolation of y to z_l11.
# Note that this is evaluated on the full sample, not only on the test split.
y_l11_true = ak.to_numpy(array["y_l11"])
y_l11_straight = ak.to_numpy(array["y"] + array["ty"] * (array["z_l11"] - array["z"]))
print("straight RMSE =", np.sqrt(mean_squared_error(y_l11_true, y_l11_straight)))
# Coefficients are emitted in the order of poly_features (see printout above).
print(format_array("y_xEndT_diff", lin_reg.coef_))

['dSlope_yEndT' 'ty dSlope_yEndT_abs' 'ty tx dSlope_xEndT'
 'ty dSlope_xEndT^2' 'ty dSlope_yEndT^2' 'tx^2 dSlope_yEndT'
 'ty tx^2 dSlope_xEndT_abs' 'ty^3 tx dSlope_xEndT']
intercept= 0.0
coef= {}
r2 score= 0.9971571295750978
RMSE = 2.422206064647647
straight RMSE = 45.67726454181064
constexpr std::array y_xEndT_diff{4039.5218935644916f, 1463.501458069602f, 2210.102099471291f, 1537.0718454152473f, -411.54564619803864f, 2594.7244053238287f, -1030.7643414023526f, 14904.842115636024f};
