You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

173 lines
6.4 KiB

10 months ago
  1. {
  2. "cells": [
  3. {
  4. "cell_type": "code",
  5. "execution_count": 1,
  6. "metadata": {},
  7. "outputs": [],
  8. "source": [
  9. "import uproot\n",
  10. "import awkward as ak\n",
  11. "import numpy as np\n",
  12. "input_tree = uproot.open({\"/work/guenther/reco_tuner/data/param_data_selected.root\": \"Selected\"})\n",
  13. "array = input_tree.arrays()\n",
  14. "array[\"dSlope_xEndT\"] = array[\"tx_l11\"] - array[\"tx\"]\n",
  15. "array[\"dSlope_yEndT\"] = array[\"ty_l11\"] - array[\"ty\"]\n",
  16. "array[\"dSlope_xEndT_abs\"] = abs(array[\"dSlope_xEndT\"])\n",
  17. "array[\"dSlope_yEndT_abs\"] = abs(array[\"dSlope_yEndT\"])\n",
  18. "array[\"yStraightEndT\"] = array[\"y\"] + array[\"ty\"] * ( 9410. - array[\"z\"])\n",
  19. "array[\"yDiffEndT\"] = (array[\"y_l11\"] + array[\"ty_l11\"] * ( 9410. - array[\"z_l11\"])) - array[\"yStraightEndT\"]\n",
  20. "\n",
  21. "def format_array(name, coef):\n",
  22. " coef = [str(c)+\"f\" for c in coef if c != 0.0]\n",
  23. " code = f\"constexpr std::array {name}\"\n",
  24. " code += \"{\" + \", \".join(list(coef)) +\"};\"\n",
  25. " return code"
  26. ]
  27. },
  28. {
  29. "cell_type": "code",
  30. "execution_count": 89,
  31. "metadata": {},
  32. "outputs": [
  33. {
  34. "name": "stdout",
  35. "output_type": "stream",
  36. "text": [
  37. "['dSlope_yEndT' 'ty dSlope_yEndT_abs' 'ty tx dSlope_xEndT'\n",
  38. " 'ty dSlope_xEndT^2' 'ty dSlope_yEndT^2' 'tx^2 dSlope_yEndT'\n",
  39. " 'ty tx^2 dSlope_xEndT_abs' 'ty^3 tx dSlope_xEndT']\n",
  40. "intercept= 0.0\n",
  41. "coef= {}\n",
  42. "r2 score= 0.9971571295750978\n",
  43. "RMSE = 2.422206064647647\n",
  44. "straight RMSE = 45.67726454181064\n",
  45. "constexpr std::array y_xEndT_diff{4039.5218935644916f, 1463.501458069602f, 2210.102099471291f, 1537.0718454152473f, -411.54564619803864f, 2594.7244053238287f, -1030.7643414023526f, 14904.842115636024f};\n"
  46. ]
  47. }
  48. ],
  49. "source": [
  50. "from sklearn.preprocessing import PolynomialFeatures\n",
  51. "from sklearn.linear_model import LinearRegression, Lasso, Ridge\n",
  52. "from sklearn.model_selection import train_test_split\n",
  53. "from sklearn.pipeline import Pipeline\n",
  54. "from sklearn.metrics import mean_squared_error\n",
  55. "\n",
  56. "features = [\n",
  57. " \"ty\", \n",
  58. " \"tx\",\n",
  59. " \"dSlope_xEndT\",\n",
  60. " \"dSlope_yEndT\",\n",
  61. " \"dSlope_xEndT_abs\",\n",
  62. " \"dSlope_yEndT_abs\",\n",
  63. "]\n",
  64. "target_feat = \"yDiffEndT\"\n",
  65. "\n",
  66. "data = np.column_stack([ak.to_numpy(array[feat]) for feat in features])\n",
  67. "target = ak.to_numpy(array[target_feat])\n",
  68. "X_train, X_test, y_train, y_test = train_test_split(data, target, test_size=0.2, random_state=42)\n",
  69. "\n",
  70. "poly = PolynomialFeatures(degree=6, include_bias=False)\n",
  71. "X_train_model = poly.fit_transform( X_train )  # fit the expansion on the training split only\n",
  72. "X_test_model = poly.transform( X_test )  # reuse the fitted expansion; do not refit on held-out data\n",
  73. "poly_features = poly.get_feature_names_out(input_features=features)\n",
  74. "keep = [\n",
  75. " #'dSlope_xEndT',\n",
  76. " 'dSlope_yEndT', # keep\n",
  77. " #'dSlope_yEndT_abs',\n",
  78. " #'ty dSlope_xEndT',\n",
  79. " #'ty dSlope_yEndT',\n",
  80. " 'ty dSlope_xEndT_abs', # keep\n",
  81. " 'ty dSlope_yEndT_abs', #keep\n",
  82. " 'ty dSlope_yEndT^2', # keep \n",
  83. " 'ty dSlope_xEndT^2', # keep\n",
  84. " #'tx dSlope_xEndT',\n",
  85. " #'tx dSlope_xEndT_abs',\n",
  86. " #'tx dSlope_yEndT',\n",
  87. " 'ty tx dSlope_xEndT', #keep\n",
  88. " 'tx^2 dSlope_yEndT', # keep\n",
  89. " #'ty^2 dSlope_xEndT',\n",
  90. " #'ty^2 dSlope_yEndT', \n",
  91. " #'ty^2 dSlope_xEndT_abs',\n",
  92. " #'ty^2 tx dSlope_xEndT',\n",
  93. " #'ty tx^2 dSlope_yEndT',\n",
  94. " 'ty tx^2 dSlope_xEndT_abs', # keep\n",
  95. " 'ty^3 tx dSlope_xEndT', #keep\n",
  96. " #'ty tx^3 dSlope_xEndT',\n",
  97. " #'ty^3 dSlope_yEndT_abs',\n",
  98. "]\n",
  99. "do_not_keep = [\n",
  100. " 'dSlope_xEndT',\n",
  101. " 'dSlope_yEndT_abs',\n",
  102. " 'ty dSlope_xEndT',\n",
  103. " 'tx dSlope_xEndT',\n",
  104. " 'tx dSlope_xEndT_abs',\n",
  105. " 'tx dSlope_yEndT',\n",
  106. " 'ty^2 dSlope_xEndT',\n",
  107. " 'ty^3 dSlope_yEndT_abs',\n",
  108. " 'ty tx dSlope_yEndT',\n",
  109. " 'ty tx^3 dSlope_xEndT',\n",
  110. " 'ty tx^2 dSlope_yEndT',\n",
  111. "]\n",
  112. "reduce = True\n",
  113. "if reduce:\n",
  114. " remove = [i for i, f in enumerate(poly_features) if (keep and f not in keep )]\n",
  115. " X_train_model = np.delete( X_train_model, remove, axis=1)\n",
  116. " X_test_model = np.delete( X_test_model, remove, axis=1)\n",
  117. " poly_features = np.delete(poly_features, remove )\n",
  118. " print(poly_features)\n",
  119. "if not reduce:\n",
  120. " remove = [i for i, f in enumerate(poly_features) if (\"dSlope_\" not in f) or (\"EndT^\" in f) or (\"abs^\" in f) or (\"EndT dSlope\" in f) or (\"abs dSlope\" in f)]\n",
  121. " X_train_model = np.delete( X_train_model, remove, axis=1)\n",
  122. " X_test_model = np.delete( X_test_model, remove, axis=1)\n",
  123. " poly_features = np.delete(poly_features, remove )\n",
  124. " #print(poly_features)\n",
  125. " lin_reg = Lasso(fit_intercept=False, alpha=0.000001)\n",
  126. "else:\n",
  127. " lin_reg = LinearRegression(fit_intercept=False)\n",
  128. "lin_reg.fit( X_train_model, y_train)\n",
  129. "y_pred_test = lin_reg.predict( X_test_model )\n",
  130. "print(\"intercept=\", lin_reg.intercept_)\n",
  131. "print(\"coef=\", {k: v for k, v in zip(poly_features, lin_reg.coef_) if abs(v) > 1.0 and k not in keep and k not in do_not_keep})\n",
  132. "print(\"r2 score=\", lin_reg.score(X_test_model, y_test))\n",
  133. "print(\"RMSE =\", np.sqrt(mean_squared_error(y_test, y_pred_test)))  # sqrt(MSE) instead of the squared= keyword, which newer scikit-learn no longer accepts\n",
  134. "print(\"straight RMSE =\", np.sqrt(mean_squared_error(array[\"y_l11\"], array[\"y\"] + array[\"ty\"] * ( array[\"z_l11\"] - array[\"z\"] ))))  # NOTE(review): computed on the full sample, not the test split\n",
  135. "print(format_array(\"y_xEndT_diff\", lin_reg.coef_))"
  136. ]
  137. },
  138. {
  139. "cell_type": "code",
  140. "execution_count": null,
  141. "metadata": {},
  142. "outputs": [],
  143. "source": []
  144. }
  145. ],
  146. "metadata": {
  147. "kernelspec": {
  148. "display_name": "Python 3.10.6 (conda)",
  149. "language": "python",
  150. "name": "python3"
  151. },
  152. "language_info": {
  153. "codemirror_mode": {
  154. "name": "ipython",
  155. "version": 3
  156. },
  157. "file_extension": ".py",
  158. "mimetype": "text/x-python",
  159. "name": "python",
  160. "nbconvert_exporter": "python",
  161. "pygments_lexer": "ipython3",
  162. "version": "3.10.6"
  163. },
  164. "orig_nbformat": 4,
  165. "vscode": {
  166. "interpreter": {
  167. "hash": "a2eff8b4da8b8eebf5ee2e5f811f31a557e0a202b4d2f04f849b065340a6eda6"
  168. }
  169. }
  170. },
  171. "nbformat": 4,
  172. "nbformat_minor": 2
  173. }