diff --git a/notebooks/03_ml_basics_ex_1_magic.ipynb b/notebooks/03_ml_basics_ex_1_magic.ipynb new file mode 100644 index 0000000..110f395 --- /dev/null +++ b/notebooks/03_ml_basics_ex_1_magic.ipynb @@ -0,0 +1,228 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Exercise: Classification of air showers measured with the MAGIC telescope\n", + "\n", + "The [MAGIC telescope](https://en.wikipedia.org/wiki/MAGIC_(telescope)) is a Cherenkov telescope situated on La Palma, one of the Canary Islands. The [MAGIC machine learning dataset](https://archive.ics.uci.edu/ml/datasets/magic+gamma+telescope) can be obtained from [UC Irvine Machine Learning Repository](https://archive.ics.uci.edu/ml/index.php).\n", + "\n", + "The task is to separate signal events (gamma showers) and background events (hadron showers) based on the features of a measured Cherenkov shower.\n", + "\n", + "The features of a shower are:\n", + "\n", + " 1. fLength: continuous # major axis of ellipse [mm]\n", + " 2. fWidth: continuous # minor axis of ellipse [mm] \n", + " 3. fSize: continuous # 10-log of sum of content of all pixels [in #phot]\n", + " 4. fConc: continuous # ratio of sum of two highest pixels over fSize [ratio]\n", + " 5. fConc1: continuous # ratio of highest pixel over fSize [ratio]\n", + " 6. fAsym: continuous # distance from highest pixel to center, projected onto major axis [mm]\n", + " 7. fM3Long: continuous # 3rd root of third moment along major axis [mm] \n", + " 8. fM3Trans: continuous # 3rd root of third moment along minor axis [mm]\n", + " 9. fAlpha: continuous # angle of major axis with vector to origin [deg]\n", + " 10. fDist: continuous # distance from origin to center of ellipse [mm]\n", + " 11. 
class: g,h # gamma (signal), hadron (background)\n", + "\n", + "g = gamma (signal): 12332\n", + "h = hadron (background): 6688\n", + "\n", + "For technical reasons, the number of h events is underestimated.\n", + "In the real data, the h class represents the majority of the events.\n", + "\n", + "You can find further information about the MAGIC telescope and the data discrimination studies in the following [paper](https://reader.elsevier.com/reader/sd/pii/S0168900203025051?token=8A02764E2448BDC5E4DD0ED53A301295162A6E9C8F223378E8CF80B187DBFD98BD3B642AB83886944002206EB1688FF4) (R. K. Bock et al., \"Methods for multidimensional event classification: a case study using images from a Cherenkov gamma-ray telescope\" NIM A 516 (2004) 511-528) (You need to be within the university network to get free access.) " + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [], + "source": [ + "import pandas as pd\n", + "import numpy as np\n", + "from sklearn.model_selection import train_test_split" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [], + "source": [ + "filename = \"https://www.physi.uni-heidelberg.de/~reygers/lectures/2021/ml/data/magic04_data.txt\"\n", + "df = pd.read_csv(filename, engine='python')" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# use categories 1 and 0 instead of \"g\" and \"h\"\n", + "df['class'] = df['class'].map({'g': 1, 'h': 0})" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "df.head()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "#### a) Create for each variable a figure with a plot for gammas and hadrons overlaid." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import matplotlib.pyplot as plt" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "df0 = df[df['class'] == 0] # hadron data set\n", + "df1 = df[df['class'] == 1] # gamma data set\n", + "\n", + "print(len(df0),len(df1))\n", + "\n", + "### YOUR CODE ###\n", + "\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "#### b) Create training and test data sets. The test data should amount to 50\\% of the total data set." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "y = df['class'].values\n", + "X = df[[col for col in df.columns if col!=\"class\"]]\n", + "\n", + "### YOUR CODE ### \n", + "\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "#### c) Define the logistic regressor and fit the training data" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from sklearn import linear_model\n", + "\n", + "# define logistic regressor\n", + "\n", + "### YOUR CODE ###\n", + "\n", + "\n", + "\n", + "# fit training data\n", + "\n", + "### YOUR CODE ###\n", + "\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "#### d) Determine the Model Accuracy, the AUC score and the Run time" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from sklearn.metrics import roc_auc_score\n", + "\n", + "### YOUR CODE ###\n", + "\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "#### e) Plot the ROC curve (background rejection vs. signal efficiency)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import matplotlib.pyplot as plt\n", + "from sklearn.metrics 
import roc_curve\n", + "%matplotlib inline\n", + "\n", + "y_pred_prob = logreg.predict_proba(X_test) # predicted probabilities\n", + "\n", + "### YOUR CODE ###\n", + "\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "### YOUR CODE ###\n", + "\n" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.8.5" + } + }, + "nbformat": 4, + "nbformat_minor": 4 +} diff --git a/notebooks/03_ml_basics_iris_softmax_regression.ipynb b/notebooks/03_ml_basics_iris_softmax_regression.ipynb new file mode 100644 index 0000000..f8240af --- /dev/null +++ b/notebooks/03_ml_basics_iris_softmax_regression.ipynb @@ -0,0 +1,383 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Simple classification example: the iris dataset" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [], + "source": [ + "import matplotlib.pyplot as plt\n", + "import pandas as pd\n", + "from sklearn import datasets\n", + "from sklearn.model_selection import train_test_split\n", + "from sklearn.metrics import classification_report\n", + "from sklearn.metrics import accuracy_score\n", + "from sklearn.metrics import confusion_matrix" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [], + "source": [ + "# import some data to play with\n", + "# columns: Sepal Length, Sepal Width, Petal Length and Petal Width\n", + "iris = datasets.load_iris()\n", + "X = iris.data\n", + "y = iris.target" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
Sepal Length (cm)Sepal Width (cm)Petal Length (cm)Petal Width (cm)category
05.13.51.40.20
14.93.01.40.20
24.73.21.30.20
34.63.11.50.20
45.03.61.40.20
\n", + "
" + ], + "text/plain": [ + " Sepal Length (cm) Sepal Width (cm) Petal Length (cm) Petal Width (cm) \\\n", + "0 5.1 3.5 1.4 0.2 \n", + "1 4.9 3.0 1.4 0.2 \n", + "2 4.7 3.2 1.3 0.2 \n", + "3 4.6 3.1 1.5 0.2 \n", + "4 5.0 3.6 1.4 0.2 \n", + "\n", + " category \n", + "0 0 \n", + "1 0 \n", + "2 0 \n", + "3 0 \n", + "4 0 " + ] + }, + "execution_count": 3, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# just to create a nice table\n", + "df = pd.DataFrame({\"Sepal Length (cm)\": X[:,0], \"Sepal Width (cm)\": X[:,1], \n", + " 'Petal Length (cm)': X[:,2], 'Petal Width (cm)': X[:,3], \n", + " 'category': y})\n", + "df.head()" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "['setosa', 'versicolor', 'virginica']" + ] + }, + "execution_count": 4, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "list(iris.target_names)" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [], + "source": [ + "# split data into training and test data sets\n", + "x_train, x_test, y_train, y_test = train_test_split(X, y, test_size=0.5, random_state=42)" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "Text(0, 0.5, 'Petal width')" + ] + }, + "execution_count": 6, + "metadata": {}, + "output_type": "execute_result" + }, + { + "data": { + "image/png": "", + "text/plain": [ + "
" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "# plot with color code\n", + "plt.subplots(1, 2, figsize=(10, 5))\n", + "\n", + "plt.subplot(1, 2, 1)\n", + "plt.scatter(X[:, 0], X[:, 1], c=y, edgecolor='k')\n", + "plt.xlabel('Sepal length')\n", + "plt.ylabel('Sepal width')\n", + "\n", + "plt.subplot(1, 2, 2)\n", + "plt.scatter(X[:, 2], X[:, 3], c=y, edgecolor='k')\n", + "plt.xlabel('Petal length')\n", + "plt.ylabel('Petal width')" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Softmax regression" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/local/home/marks/anaconda3/envs/myML/lib/python3.8/site-packages/sklearn/linear_model/_logistic.py:1173: FutureWarning: `penalty='none'`has been deprecated in 1.2 and will be removed in 1.4. To keep the past behaviour, set `penalty=None`.\n", + " warnings.warn(\n" + ] + } + ], + "source": [ + "from sklearn.linear_model import LogisticRegression\n", + "log_reg = LogisticRegression(multi_class='multinomial', penalty=None)\n", + "log_reg.fit(x_train, y_train);" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## k-nearest neighbor" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": {}, + "outputs": [], + "source": [ + "from sklearn.neighbors import KNeighborsClassifier\n", + "kn_neigh = KNeighborsClassifier(n_neighbors=5)\n", + "kn_neigh.fit(x_train, y_train);" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Fisher linear discriminant" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "metadata": {}, + "outputs": [], + "source": [ + "from sklearn.discriminant_analysis import LinearDiscriminantAnalysis\n", + "fisher_ld = LinearDiscriminantAnalysis()\n", + "fisher_ld.fit(x_train, y_train);" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ 
+ "## Classification accuracy" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "LogisticRegression\n", + "accuracy: 0.96\n", + "[[29 0 0]\n", + " [ 0 23 0]\n", + " [ 0 3 20]] \n", + "\n", + "KNeighborsClassifier\n", + "accuracy: 0.95\n", + "[[29 0 0]\n", + " [ 0 23 0]\n", + " [ 0 4 19]] \n", + "\n", + "LinearDiscriminantAnalysis\n", + "accuracy: 0.99\n", + "[[29 0 0]\n", + " [ 0 23 0]\n", + " [ 0 1 22]] \n", + "\n" + ] + } + ], + "source": [ + "for clf in [log_reg, kn_neigh, fisher_ld]:\n", + " y_pred = clf.predict(x_test)\n", + " acc = accuracy_score(y_test, y_pred)\n", + " print(type(clf).__name__)\n", + " print(f\"accuracy: {acc:0.2f}\")\n", + " \n", + " # confusion matrix: rows: true class, columns: predicted class\n", + " print(confusion_matrix(y_test, y_pred),\"\\n\")" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + " precision recall f1-score support\n", + "\n", + " 0 1.00 1.00 1.00 29\n", + " 1 0.88 1.00 0.94 23\n", + " 2 1.00 0.87 0.93 23\n", + "\n", + " accuracy 0.96 75\n", + " macro avg 0.96 0.96 0.96 75\n", + "weighted avg 0.96 0.96 0.96 75\n", + "\n" + ] + } + ], + "source": [ + "y_pred = log_reg.predict(x_test)\n", + "print(classification_report(y_test, y_pred))" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.8.16" + } + }, + "nbformat": 4, + "nbformat_minor": 4 +} diff --git 
a/notebooks/03_ml_basics_log_regr_heart_disease.ipynb b/notebooks/03_ml_basics_log_regr_heart_disease.ipynb new file mode 100644 index 0000000..237da7f --- /dev/null +++ b/notebooks/03_ml_basics_log_regr_heart_disease.ipynb @@ -0,0 +1,502 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Logistic regression with scikit-learn: heart disease data set" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [], + "source": [ + "import numpy as np\n", + "import pandas as pd\n", + "import matplotlib.pyplot as plt" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Read data " + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
agesexcptrestbpscholfbsrestecgthalachexangoldpeakslopecathaltarget
063131452331015002.30011
137121302500118703.50021
241011302040017201.42021
356111202360117800.82021
457001203540116310.62021
.............................................
29857001402410112310.21030
29945131102640113201.21030
30068101441931114103.41230
30157101301310111511.21130
30257011302360017400.01120
\n", + "

303 rows × 14 columns

\n", + "
" + ], + "text/plain": [ + " age sex cp trestbps chol fbs restecg thalach exang oldpeak \\\n", + "0 63 1 3 145 233 1 0 150 0 2.3 \n", + "1 37 1 2 130 250 0 1 187 0 3.5 \n", + "2 41 0 1 130 204 0 0 172 0 1.4 \n", + "3 56 1 1 120 236 0 1 178 0 0.8 \n", + "4 57 0 0 120 354 0 1 163 1 0.6 \n", + ".. ... ... .. ... ... ... ... ... ... ... \n", + "298 57 0 0 140 241 0 1 123 1 0.2 \n", + "299 45 1 3 110 264 0 1 132 0 1.2 \n", + "300 68 1 0 144 193 1 1 141 0 3.4 \n", + "301 57 1 0 130 131 0 1 115 1 1.2 \n", + "302 57 0 1 130 236 0 0 174 0 0.0 \n", + "\n", + " slope ca thal target \n", + "0 0 0 1 1 \n", + "1 0 0 2 1 \n", + "2 2 0 2 1 \n", + "3 2 0 2 1 \n", + "4 2 0 2 1 \n", + ".. ... .. ... ... \n", + "298 1 0 3 0 \n", + "299 1 0 3 0 \n", + "300 1 2 3 0 \n", + "301 1 1 3 0 \n", + "302 1 1 2 0 \n", + "\n", + "[303 rows x 14 columns]" + ] + }, + "execution_count": 2, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# filename = \"heart.csv\"\n", + "filename = \"https://www.physi.uni-heidelberg.de/~reygers/lectures/2021/ml/data/heart.csv\"\n", + "df = pd.read_csv(filename)\n", + "df" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [], + "source": [ + "y = df['target'].values\n", + "X = df[[col for col in df.columns if col!=\"target\"]]" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [], + "source": [ + "from sklearn.model_selection import train_test_split\n", + "X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.5, shuffle=True)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Fit the model" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/local/home/marks/anaconda3/envs/myML/lib/python3.8/site-packages/sklearn/linear_model/_logistic.py:1173: FutureWarning: `penalty='none'`has been deprecated in 1.2 and 
will be removed in 1.4. To keep the past behaviour, set `penalty=None`.\n", + " warnings.warn(\n" + ] + }, + { + "data": { + "text/html": [ + "
LogisticRegression(max_iter=5000, penalty='none', tol=1e-05)
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
" + ], + "text/plain": [ + "LogisticRegression(max_iter=5000, penalty='none', tol=1e-05)" + ] + }, + "execution_count": 5, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "from sklearn.linear_model import LogisticRegression\n", + "lr = LogisticRegression(penalty=None, fit_intercept=True, max_iter=5000, tol=1E-5)\n", + "lr.fit(X_train, y_train)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Test predictions on test data set" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + " precision recall f1-score support\n", + "\n", + " 0 0.82 0.76 0.79 76\n", + " 1 0.78 0.83 0.80 76\n", + "\n", + " accuracy 0.80 152\n", + " macro avg 0.80 0.80 0.80 152\n", + "weighted avg 0.80 0.80 0.80 152\n", + "\n" + ] + } + ], + "source": [ + "from sklearn.metrics import classification_report\n", + "y_pred_lr = lr.predict(X_test)\n", + "print(classification_report(y_test, y_pred_lr))" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Compare two classifiers using the ROC curve" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": {}, + "outputs": [], + "source": [ + "from sklearn.ensemble import RandomForestClassifier\n", + "rf = RandomForestClassifier(max_depth=3)\n", + "rf.fit(X_train, y_train)\n", + "y_pred_rf = rf.predict(X_test)" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": {}, + "outputs": [ + { + "data": { + "image/png": "", + "text/plain": [ + "
" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "from sklearn.metrics import roc_curve\n", + "\n", + "y_pred_prob_lr = lr.predict_proba(X_test) # predicted probabilities\n", + "fpr_lr, tpr_lr, _ = roc_curve(y_test, y_pred_prob_lr[:,1])\n", + "\n", + "y_pred_prob_rf = rf.predict_proba(X_test) # predicted probabilities\n", + "fpr_rf, tpr_rf, _ = roc_curve(y_test, y_pred_prob_rf[:,1])\n", + "\n", + "plt.plot(tpr_lr, 1-fpr_lr, label=\"log. regression\")\n", + "plt.plot(tpr_rf, 1-fpr_rf, label=\"random forest\")\n", + "\n", + "plt.xlabel('Recall (TPR)', fontsize=18)\n", + "plt.ylabel('Specificity (1 - FPR)', fontsize=18);\n", + "plt.legend(fontsize=15)\n", + "\n", + "plt.savefig(\"03_ml_basics_log_regr_heart_disease.pdf\")" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Area under Curve (AUC) scores: 0.80, 0.80\n" + ] + } + ], + "source": [ + "from sklearn.metrics import roc_auc_score\n", + "auc_lr = roc_auc_score(y_test, y_pred_prob_lr[:,1]) # AUC needs scores, not hard class labels\n", + "auc_rf = roc_auc_score(y_test, y_pred_prob_rf[:,1]) # AUC needs scores, not hard class labels\n", + "print(f\"Area under Curve (AUC) scores: {auc_lr:.2f}, {auc_rf:.2f}\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.8.16" + } + }, + "nbformat": 4, + "nbformat_minor": 4 +} diff --git a/notebooks/03_ml_basics_logistic_regression.ipynb b/notebooks/03_ml_basics_logistic_regression.ipynb new file mode 
100644 index 0000000..d628cce --- /dev/null +++ b/notebooks/03_ml_basics_logistic_regression.ipynb @@ -0,0 +1,195 @@ +{ + 
"cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Simple example of logistic regression with scikit-learn" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [], + "source": [ + "import numpy as np\n", + "import pandas as pd\n", + "import matplotlib.pyplot as plt" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Read data \n", + "Data are from the [wikipedia article on logistic regression](https://en.wikipedia.org/wiki/Logistic_regression)" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [], + "source": [ + "# data: 1. hours studies, 2. passed (0/1) \n", + "filename = \"https://www.physi.uni-heidelberg.de/~reygers/lectures/2021/ml/data/exam.txt\"\n", + "df = pd.read_csv(filename, engine='python', sep='\\s+')" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [], + "source": [ + "x_tmp = df['hours_studied'].values\n", + "x = np.reshape(x_tmp, (-1, 1))\n", + "y = df['passed'].values" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Fit the model" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [], + "source": [ + "from sklearn.linear_model import LogisticRegression\n", + "clf = LogisticRegression(penalty='none', fit_intercept=True)\n", + "clf.fit(x, y);" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Calculate predictions" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": {}, + "outputs": [], + "source": [ + "hours_studied_tmp = np.linspace(0., 6., 1000)\n", + "hours_studied = np.reshape(hours_studied_tmp, (-1, 1))\n", + "y_pred = clf.predict_proba(hours_studied)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Plot result" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": {}, + "outputs": [ + { + "data": { 
+ "image/png": "\n", + "text/plain": [ + "
" + ] + }, + "metadata": { + "needs_background": "light" + }, + "output_type": "display_data" + } + ], + "source": [ + "df.plot.scatter(x='hours_studied', y='passed')\n", + "plt.plot(hours_studied, y_pred[:,1])\n", + "plt.xlabel(\"preparation time in hours\", fontsize=14)\n", + "plt.ylabel(\"probability of passing exam\", fontsize=14)\n", + "plt.savefig(\"03_ml_basics_logistic_regression.pdf\")" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "{'C': 1.0,\n", + " 'class_weight': None,\n", + " 'dual': False,\n", + " 'fit_intercept': True,\n", + " 'intercept_scaling': 1,\n", + " 'l1_ratio': None,\n", + " 'max_iter': 100,\n", + " 'multi_class': 'auto',\n", + " 'n_jobs': None,\n", + " 'penalty': 'none',\n", + " 'random_state': None,\n", + " 'solver': 'lbfgs',\n", + " 'tol': 0.0001,\n", + " 'verbose': 0,\n", + " 'warm_start': False}" + ] + }, + "execution_count": 8, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "clf.get_params()" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Coefficient: [[1.50464522]]\n", + "Intercept: [-4.07771764]\n" + ] + } + ], + "source": [ + "print('Coefficient: ', clf.coef_)\n", + "print('Intercept: ', clf.intercept_)" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.8.5" + } + }, + "nbformat": 4, + "nbformat_minor": 4 +}