You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.
 
 
 

173 lines
6.4 KiB

{
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [],
"source": [
"import uproot\n",
"import awkward as ak\n",
"import numpy as np\n",
"input_tree = uproot.open({\"/work/guenther/reco_tuner/data/param_data_selected.root\": \"Selected\"})\n",
"array = input_tree.arrays()\n",
"array[\"dSlope_xEndT\"] = array[\"tx_l11\"] - array[\"tx\"]\n",
"array[\"dSlope_yEndT\"] = array[\"ty_l11\"] - array[\"ty\"]\n",
"array[\"dSlope_xEndT_abs\"] = abs(array[\"dSlope_xEndT\"])\n",
"array[\"dSlope_yEndT_abs\"] = abs(array[\"dSlope_yEndT\"])\n",
"array[\"yStraightEndT\"] = array[\"y\"] + array[\"ty\"] * ( 9410. - array[\"z\"])\n",
"array[\"yDiffEndT\"] = (array[\"y_l11\"] + array[\"ty_l11\"] * ( 9410. - array[\"z_l11\"])) - array[\"yStraightEndT\"]\n",
"\n",
"def format_array(name, coef):\n",
"    \"\"\"Return a C++ ``constexpr std::array`` literal holding the non-zero coefficients.\n",
"\n",
"    Zero coefficients (e.g. terms eliminated by a Lasso fit) are dropped, so the\n",
"    emitted array contains only the terms that survived the fit.\n",
"    \"\"\"\n",
"    literals = [f\"{value}f\" for value in coef if value != 0.0]\n",
"    return f\"constexpr std::array {name}\" + \"{\" + \", \".join(literals) + \"};\""
]
},
{
"cell_type": "code",
"execution_count": 89,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"['dSlope_yEndT' 'ty dSlope_yEndT_abs' 'ty tx dSlope_xEndT'\n",
" 'ty dSlope_xEndT^2' 'ty dSlope_yEndT^2' 'tx^2 dSlope_yEndT'\n",
" 'ty tx^2 dSlope_xEndT_abs' 'ty^3 tx dSlope_xEndT']\n",
"intercept= 0.0\n",
"coef= {}\n",
"r2 score= 0.9971571295750978\n",
"RMSE = 2.422206064647647\n",
"straight RMSE = 45.67726454181064\n",
"constexpr std::array y_xEndT_diff{4039.5218935644916f, 1463.501458069602f, 2210.102099471291f, 1537.0718454152473f, -411.54564619803864f, 2594.7244053238287f, -1030.7643414023526f, 14904.842115636024f};\n"
]
}
],
"source": [
"from sklearn.preprocessing import PolynomialFeatures\n",
"from sklearn.linear_model import LinearRegression, Lasso, Ridge\n",
"from sklearn.model_selection import train_test_split\n",
"from sklearn.pipeline import Pipeline\n",
"from sklearn.metrics import mean_squared_error\n",
"\n",
"features = [\n",
" \"ty\", \n",
" \"tx\",\n",
" \"dSlope_xEndT\",\n",
" \"dSlope_yEndT\",\n",
" \"dSlope_xEndT_abs\",\n",
" \"dSlope_yEndT_abs\",\n",
"]\n",
"target_feat = \"yDiffEndT\"\n",
"\n",
"data = np.column_stack([ak.to_numpy(array[feat]) for feat in features])\n",
"target = ak.to_numpy(array[target_feat])\n",
"X_train, X_test, y_train, y_test = train_test_split(data, target, test_size=0.2, random_state=42)\n",
"\n",
"poly = PolynomialFeatures(degree=6, include_bias=False)\n",
"X_train_model = poly.fit_transform( X_train )\n",
"# Fit the feature expansion on the training split only and re-use it on the\n",
"# test split: calling fit_transform on X_test re-fits the transformer on test\n",
"# data. PolynomialFeatures happens to produce the same output either way, but\n",
"# transform() is the correct (and cheaper) call and keeps the train/test\n",
"# protocol clean.\n",
"X_test_model = poly.transform( X_test )\n",
"poly_features = poly.get_feature_names_out(input_features=features)\n",
"keep = [\n",
" #'dSlope_xEndT',\n",
" 'dSlope_yEndT', # keep\n",
" #'dSlope_yEndT_abs',\n",
" #'ty dSlope_xEndT',\n",
" #'ty dSlope_yEndT',\n",
" 'ty dSlope_xEndT_abs', # keep\n",
" 'ty dSlope_yEndT_abs', #keep\n",
" 'ty dSlope_yEndT^2', # keep \n",
" 'ty dSlope_xEndT^2', # keep\n",
" #'tx dSlope_xEndT',\n",
" #'tx dSlope_xEndT_abs',\n",
" #'tx dSlope_yEndT',\n",
" 'ty tx dSlope_xEndT', #keep\n",
" 'tx^2 dSlope_yEndT', # keep\n",
" #'ty^2 dSlope_xEndT',\n",
" #'ty^2 dSlope_yEndT', \n",
" #'ty^2 dSlope_xEndT_abs',\n",
" #'ty^2 tx dSlope_xEndT',\n",
" #'ty tx^2 dSlope_yEndT',\n",
" 'ty tx^2 dSlope_xEndT_abs', # keep\n",
" 'ty^3 tx dSlope_xEndT', #keep\n",
" #'ty tx^3 dSlope_xEndT',\n",
" #'ty^3 dSlope_yEndT_abs',\n",
"]\n",
"do_not_keep = [\n",
" 'dSlope_xEndT',\n",
" 'dSlope_yEndT_abs',\n",
" 'ty dSlope_xEndT',\n",
" 'tx dSlope_xEndT',\n",
" 'tx dSlope_xEndT_abs',\n",
" 'tx dSlope_yEndT',\n",
" 'ty^2 dSlope_xEndT',\n",
" 'ty^3 dSlope_yEndT_abs',\n",
" 'ty tx dSlope_yEndT',\n",
" 'ty tx^3 dSlope_xEndT',\n",
" 'ty tx^2 dSlope_yEndT',\n",
"]\n",
"reduce = True\n",
"if reduce:\n",
" remove = [i for i, f in enumerate(poly_features) if (keep and f not in keep )]\n",
" X_train_model = np.delete( X_train_model, remove, axis=1)\n",
" X_test_model = np.delete( X_test_model, remove, axis=1)\n",
" poly_features = np.delete(poly_features, remove )\n",
" print(poly_features)\n",
"if not reduce:\n",
" remove = [i for i, f in enumerate(poly_features) if (\"dSlope_\" not in f) or (\"EndT^\" in f) or (\"abs^\" in f) or (\"EndT dSlope\" in f) or (\"abs dSlope\" in f)]\n",
" X_train_model = np.delete( X_train_model, remove, axis=1)\n",
" X_test_model = np.delete( X_test_model, remove, axis=1)\n",
" poly_features = np.delete(poly_features, remove )\n",
" #print(poly_features)\n",
" lin_reg = Lasso(fit_intercept=False, alpha=0.000001)\n",
"else:\n",
" lin_reg = LinearRegression(fit_intercept=False)\n",
"lin_reg.fit( X_train_model, y_train)\n",
"y_pred_test = lin_reg.predict( X_test_model )\n",
"print(\"intercept=\", lin_reg.intercept_)\n",
"print(\"coef=\", {k: v for k, v in zip(poly_features, lin_reg.coef_) if abs(v) > 1.0 and k not in keep and k not in do_not_keep})\n",
"print(\"r2 score=\", lin_reg.score(X_test_model, y_test))\n",
"# NOTE(review): mean_squared_error(..., squared=False) is deprecated since\n",
"# scikit-learn 1.4 and removed in 1.6 -- switch to\n",
"# sklearn.metrics.root_mean_squared_error when the environment is upgraded.\n",
"print(\"RMSE =\", mean_squared_error(y_test, y_pred_test, squared=False))\n",
"print(\"straight RMSE =\", mean_squared_error(array[\"y_l11\"], array[\"y\"] + array[\"ty\"] * ( array[\"z_l11\"] - array[\"z\"] ), squared=False))\n",
"print(format_array(\"y_xEndT_diff\", lin_reg.coef_))"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3.10.6 (conda)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.10.6"
},
"orig_nbformat": 4,
"vscode": {
"interpreter": {
"hash": "a2eff8b4da8b8eebf5ee2e5f811f31a557e0a202b4d2f04f849b065340a6eda6"
}
}
},
"nbformat": 4,
"nbformat_minor": 2
}