You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.

91 lines
3.4 KiB

10 months ago
  1. import awkward as ak
  2. from sklearn.preprocessing import PolynomialFeatures
  3. from sklearn.linear_model import LinearRegression
  4. from sklearn.model_selection import train_test_split
  5. from sklearn.metrics import mean_squared_error
  6. import numpy as np
  7. def fit_linear_regression_model(
  8. array: ak.Array,
  9. target_feat: str,
  10. features: list[str],
  11. degree: int,
  12. keep: list[str] = None,
  13. keep_only_linear_in: str = "",
  14. remove: list[str] = None,
  15. include_bias: bool = False,
  16. fit_intercept: bool = False,
  17. test_size=0.2,
  18. random_state=42,
  19. ) -> tuple[LinearRegression, list[str]]:
  20. """Wrapper around sklearn's LinearRegression with PolynomialFeatures.
  21. Args:
  22. array (ak.Array): The data.
  23. target_feat (str): Target feature to be fitted.
  24. features (list[str]): Features the target depends on.
  25. degree (int): Highest order of the polynomial.
  26. keep (list[str], optional): Monomials to keep. Defaults to None.
  27. keep_only_linear_in (str, optional): Keep only terms that are linear in this feature. Defaults to "".
  28. remove (list[str], optional): Monomials to remove. Defaults to None.
  29. include_bias (bool, optional): Inlcude bias term in polynomial. Defaults to False.
  30. fit_intercept (bool, optional): Fit zeroth order. Defaults to False.
  31. test_size (float, optional): Fraction of data used for testing. Defaults to 0.2.
  32. random_state (int, optional): Defaults to 42.
  33. Raises:
  34. NotImplementedError: Simultaneous removing and keeping is not implemented.
  35. Returns:
  36. tuple[LinearRegression, list[str]]: The linear regression object and the kept features.
  37. """
  38. data = np.column_stack([ak.to_numpy(array[feat]) for feat in features])
  39. target = ak.to_numpy(array[target_feat])
  40. X_train, X_test, y_train, y_test = train_test_split(
  41. data,
  42. target,
  43. test_size=test_size,
  44. random_state=random_state,
  45. )
  46. poly = PolynomialFeatures(degree=degree, include_bias=include_bias)
  47. X_train_model = poly.fit_transform(X_train)
  48. X_test_model = poly.fit_transform(X_test)
  49. poly_features = poly.get_feature_names_out(input_features=features)
  50. if not remove:
  51. if keep:
  52. remove = [i for i, f in enumerate(poly_features) if f not in keep]
  53. elif keep_only_linear_in:
  54. # remove everything that's not linear in variable
  55. # the corrections should vanish
  56. remove = [
  57. i
  58. for i, f in enumerate(poly_features)
  59. if (keep_only_linear_in not in f) or (keep_only_linear_in + "^" in f)
  60. ]
  61. else:
  62. remove = []
  63. elif remove and keep:
  64. raise NotImplementedError
  65. X_train_model = np.delete(X_train_model, remove, axis=1)
  66. X_test_model = np.delete(X_test_model, remove, axis=1)
  67. poly_features = np.delete(poly_features, remove)
  68. lin_reg = LinearRegression(fit_intercept=fit_intercept)
  69. lin_reg.fit(X_train_model, y_train)
  70. y_pred_test = lin_reg.predict(X_test_model)
  71. print(f"Parameterisation for {target_feat}:")
  72. print("intercept=", lin_reg.intercept_)
  73. print(
  74. "coef=",
  75. dict(
  76. zip(
  77. poly_features,
  78. lin_reg.coef_,
  79. ),
  80. ),
  81. )
  82. print("r2 score=", lin_reg.score(X_test_model, y_test))
  83. print("RMSE =", mean_squared_error(y_test, y_pred_test, squared=False))
  84. print()
  85. return (lin_reg, poly_features)