update files

update
2023-04-03 13:08:49 +02:00 · 2023-04-03 13:04:54 +02:00
14 changed files with 3837 additions and 0 deletions
--- a/notebooks/01_intro_ex_1a_sol.ipynb
+++ b/notebooks/01_intro_ex_1a_sol.ipynb
@ -0,0 +1,139 @@
 {
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Exercise 1: Create numpy array and draw rgb color objects"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import numpy as np\n",
    "import matplotlib.pyplot as plt"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "create data array 2x2 as pixel position and 1x3 as rgb color data"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "width, height = 200, 200\n",
    "data = np.zeros((height, width, 3), dtype=np.uint8)\n"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "draw blue cross"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "x =  np.arange(width)\n",
    "x_1 = np.arange(width)\n",
    "x_2 = np.arange(width-1,-1,-1)\n",
    "y = np.arange(height)\n",
    "data[x_1,y] = [0,0,255]\n",
    "data[x_2,y] = [0,0,255]"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    " draw a square "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "lower = 55\n",
    "upper = 75\n",
    "data[lower:upper,lower:upper] = [0,255,0]"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "create a mask of a circle using indexing\n",
    "np.newaxis adds another dimension\n",
    "we create a row and column vector and fill it using the condition"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "x_center = 100\n",
    "y_center = 100\n",
    "radius = 10\n",
    "mask = (x[np.newaxis,:]-x_center)**2 + (y[:,np.newaxis]-y_center)**2 < radius**2\n",
    "data[mask] = [255,0,0]\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# plot image\n",
    "plt.figure(figsize=(4.,4.),dpi=100,facecolor='lightgrey')\n",
    "plt.imshow(data)\n",
    "plt.show()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.8.16"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 4
 }
--- a/notebooks/01_intro_ex_1b_sol.ipynb
+++ b/notebooks/01_intro_ex_1b_sol.ipynb
@ -0,0 +1,133 @@
 {
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Exercise 1b: Read a binary file which contains pixel data and apply\n",
    "transformations"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import numpy as np\n",
    "import matplotlib.pyplot as plt"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# load figure as 2D array \n",
    "data = np.load('horse.npy')\n",
    "print(data.shape)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# just scale the data by a factor and shift by trans\n",
    "trans = np.ones(data.shape)\n",
    "trans[0,:] *=0.6\n",
    "trans[1,:] *=0.4\n",
    "factor = 0.5 \n",
    "data_scale = data * factor + trans"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "#compression in x and y \n",
    "sx = 0.4\n",
    "sy = 0.9\n",
    "t = np.array([[sx,0],[0,sy]])\n",
    "data_comp = t@data"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "#rotation by an angle theta\n",
    "theta = 0.5\n",
    "data_rot = np.array([[np.cos(theta),-np.sin(theta)],[np.sin(theta), np.cos(theta)]])@data"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "#spiegelung an der x Achse\n",
    "tx = np.array([[1,0],[0,-1]])  # mirror x axis\n",
    "ty = np.array([[-1,0],[0,1]])  # mirror y axis\n",
    "tp = np.array([[-1,0],[0,-1]]) # mirror (0,0)\n",
    "data_mirror = tp@data"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# create figure for the transformations\n",
    "plt.figure(figsize=(10.0,10.0),dpi=100,facecolor='lightgrey')\n",
    "plt.suptitle('Plot Transformations')\n",
    "plt.subplot(2,2,1)\n",
    "plt.title('original picture')\n",
    "plt.plot(data[0,:],data[1,:],'.')\n",
    "plt.axis([-1.2,1.2,-1.2,1.2])\n",
    "plt.subplot(2,2,2)\n",
    "plt.title('scaling and translation')\n",
    "plt.plot(data_scale[0,:],data_scale[1,:],'.')\n",
    "plt.axis([-1.2,1.2,-1.2,1.2])\n",
    "plt.subplot(2,2,3)\n",
    "plt.title('compression')\n",
    "plt.plot(data_comp[0,:],data_comp[1,:],'.')\n",
    "plt.axis([-1.2,1.2,-1.2,1.2])\n",
    "plt.subplot(2,2,4)\n",
    "plt.title('rotation and mirror at p(0,0)')\n",
    "plt.plot(data_rot[0,:],data_rot[1,:],'.')\n",
    "plt.plot(data_mirror[0,:],data_mirror[1,:],'.')\n",
    "plt.axis([-1.2,1.2,-1.2,1.2])"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.8.16"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 4
 }
--- a/notebooks/01_intro_ex_2_sol.ipynb
+++ b/notebooks/01_intro_ex_2_sol.ipynb
--- a/notebooks/02_fit_fitGraph.ipynb
+++ b/notebooks/02_fit_fitGraph.ipynb
--- a/notebooks/02_fit_iminuitFit.ipynb
+++ b/notebooks/02_fit_iminuitFit.ipynb
@ -0,0 +1,291 @@
 {
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Fit with the python interface to Minuit 2 called iminuit\n",
    "https://iminuit.readthedocs.io/en/stable/"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "from matplotlib import pyplot as plt\n",
    "plt.rcParams[\"font.size\"] = 20\n",
    "import numpy as np"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Data "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "x = np.array([1, 2, 3, 4, 5, 6, 7, 8, 9, 10], dtype='d')\n",
    "dx =  np.array([0.1,0.1,0.5,0.1,0.5,0.1,0.5,0.1,0.5,0.1], dtype='d')\n",
    "y = np.array([1.1 ,2.3 ,2.7 ,3.2 ,3.1 ,2.4 ,1.7 ,1.5 ,1.5  ,1.7 ], dtype='d')\n",
    "dy = np.array([0.15,0.22,0.29,0.39,0.31,0.21,0.13,0.15,0.19,0.13], dtype='d')"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Define fit function"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "def pol3(a0, a1, a2, a3):\n",
    "    return a0 + x*a1 + a2*x**2 + a3*x**3"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "least-squares function: sum of data residuals squared"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "def LSQ(a0, a1, a2, a3):\n",
    "    return np.sum((y - pol3(a0, a1, a2, a3)) ** 2 / dy ** 2)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "import Minuit object"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "from iminuit import Minuit"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Minuit instance using LSQ function to minimize"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "LSQ.errordef = Minuit.LEAST_SQUARES\n",
    "#LSQ.errordef = Minuit.LIKELIHOOD\n",
    "m = Minuit(LSQ,a0=-1.3, a1=2.6 ,a2=-0.24 ,a3=0.005)\n",
    "m.fixed[\"a3\"] = True \n",
    "m.params"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "run migrad"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "m.fixed[\"a3\"] = False\n",
    "m.params\n",
    "m.migrad()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Get contour"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "m.draw_mncontour(\"a2\", \"a3\",  cl=[1, 2, 3])"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Improve the fit"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "m.hesse()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "m.minos()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "access fit results"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "print(m.values,m.errors)\n",
    "a0_fit = m.values[\"a0\"]\n",
    "a1_fit = m.values[\"a1\"]\n",
    "a2_fit = m.values[\"a2\"]\n",
    "a3_fit = m.values[\"a3\"]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "print (m.covariance)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "prepare data to display fitted function "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "x_plot = np.linspace( 0.5, 10.5 , 500 )\n",
    "y_fit = a0_fit + a1_fit * x_plot + a2_fit * x_plot**2 +  a3_fit * x_plot**3"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "The Minos algorithm uses the profile likelihood method to compute (generally asymmetric) confidence intervals. This can be plotted"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "m.draw_profile(\"a2\")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Get a 2D contour of the function around the minimum for 2 parameters"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "m.draw_mncontour(\"a2\", \"a3\" , cl=[1, 2, 3])"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "lotlib"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "plt.figure()\n",
    "plt.errorbar(x, y, dy , dx, fmt=\"o\")\n",
    "plt.plot(x_plot, y_fit)\n",
    "plt.title(\"iminuit Fit Test\")\n",
    "plt.xlim(-0.1, 10.1)\n",
    "plt.show()"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.8.16"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 4
 }
--- a/notebooks/02_fit_numpyFit.ipynb
+++ b/notebooks/02_fit_numpyFit.ipynb
--- a/notebooks/03_ml_basics_display_Clothing.ipynb
+++ b/notebooks/03_ml_basics_display_Clothing.ipynb
@ -0,0 +1,122 @@
 {
 "cells": [
  {
   "cell_type": "markdown",
   "id": "8f9f0e7b",
   "metadata": {},
   "source": [
    "Display fashion_mnist dataset of clothes from Zalando"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "cc829d9a",
   "metadata": {},
   "outputs": [],
   "source": [
    "import tensorflow as tf\n",
    "from tensorflow import keras\n",
    "import matplotlib.pyplot as plt\n",
    "import numpy as np"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "63348efe",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Load the MNIST Fashion dataset\n",
    "(x_train, y_train), (x_test, y_test) = keras.datasets.fashion_mnist.load_data()\n",
    "# Set the class names\n",
    "class_names = ['T-shirt/top', 'Trouser', 'Pullover', 'Dress', 'Coat', \n",
    "               'Sandal', 'Shirt', 'Sneaker', 'Bag', 'Ankle boot']\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "a6c86027",
   "metadata": {},
   "outputs": [],
   "source": [
    "# print the shape of the numpy arrays\n",
    "print ('Print shape of pixel data')\n",
    "print(x_train.shape)\n",
    "print ('Print shape of labels')\n",
    "print(y_train.shape)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "cc58b142",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Normalize pixel values to between 0 and 1\n",
    "x_train = x_train.astype(\"float32\") / 255.0\n",
    "x_test = x_test.astype(\"float32\") / 255.0"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "c7976111",
   "metadata": {},
   "outputs": [],
   "source": [
    "# choose an image num  to print\n",
    "num = 20\n",
    "image = x_train[num]\n",
    "label = y_train[num]\n",
    "\n",
    "print ('Print normailzed pixel data of image ',num, ' :')\n",
    "print(x_train[num])\n",
    "print ('Print label of image ',num ,  ' :' )\n",
    "print(y_train[num])\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "64a46625",
   "metadata": {},
   "outputs": [],
   "source": [
    "plt.figure(figsize=(10,10))\n",
    "for i in range(25):\n",
    "    plt.subplot(5,5,i+1)\n",
    "    plt.xticks([])\n",
    "    plt.yticks([])\n",
    "    plt.grid(False)\n",
    "    plt.imshow(x_train[i], cmap=plt.cm.binary)\n",
    "    plt.xlabel(class_names[y_train[i]])\n",
    "plt.show()"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.8.16"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
 }
--- a/notebooks/03_ml_basics_display_HandWrt.ipynb
+++ b/notebooks/03_ml_basics_display_HandWrt.ipynb
@ -0,0 +1,134 @@
 {
 "cells": [
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "3644475e",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Display hand writing dataset"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "8125479b",
   "metadata": {},
   "outputs": [],
   "source": [
    "import tensorflow as tf\n",
    "import numpy as np\n",
    "import matplotlib.pyplot as plt"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "d45b964f",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Load training dataset of 60000 images with greyscale values in 28 x 28\n",
    "# and labels \n",
    "(x_train, y_train), (x_test, y_test) = tf.keras.datasets.mnist.load_data()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "fa8ae2a6",
   "metadata": {},
   "outputs": [],
   "source": [
    "# print the shape of the numpy arrays\n",
    "print ('Print shape of pixel data')\n",
    "print(x_train.shape)\n",
    "print ('Print shape of labels')\n",
    "print(y_train.shape)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "be70973e",
   "metadata": {},
   "outputs": [],
   "source": [
    "# normalize pixel to 0-1\n",
    "x_train = x_train / 255\n",
    "x_test = x_test / 255"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "55f457d5",
   "metadata": {},
   "outputs": [],
   "source": [
    "# choose an image num  to display and print\n",
    "num = 20\n",
    "\n",
    "image = x_train[num]\n",
    "label = y_train[num]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "149788b7",
   "metadata": {},
   "outputs": [],
   "source": [
    "# plot the image using imshow\n",
    "plt.imshow(image, cmap='gray')\n",
    "# set the title\n",
    "plt.title(\"Label: %d\" % label )\n",
    "# remove the axis labels and ticks\n",
    "plt.axis('off')\n",
    "# show the plot\n",
    "plt.show()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "232ef6ca",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Plot 16 examples from the numpy array which was read in above\n",
    "# and display it\n",
    "fig, axes = plt.subplots(4, 4, figsize=(10, 10))\n",
    "for i , ax in enumerate(axes.ravel()):\n",
    "    ax.imshow(x_train[num+i], cmap='gray')\n",
    "    ax.set_title(\"Label: %d\" % y_train[num+i])\n",
    "    ax.axis('off')\n",
    "plt.suptitle(\"Examples of training set images\")\n",
    "plt.show()"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.8.16"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
 }
--- a/notebooks/03_ml_basics_display_HorseOrHuman.ipynb
+++ b/notebooks/03_ml_basics_display_HorseOrHuman.ipynb
@ -0,0 +1,197 @@
 {
 "cells": [
  {
   "cell_type": "markdown",
   "id": "2eaba66b",
   "metadata": {},
   "source": [
    "Read and Display Horse or Human machine learning dataset"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "f1e48ac0",
   "metadata": {},
   "outputs": [],
   "source": [
    "import tensorflow as tf\n",
    "import numpy as np\n",
    "import tensorflow_datasets as tfds\n",
    "from tensorflow.keras import regularizers\n",
    "import matplotlib.pyplot as plt"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "feda024e",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Load the horse or human dataset\n",
    "#(300, 300, 3) unint8\n",
    "dataset, label = tfds.load('horses_or_humans', with_info=True)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "35991dec",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Extract the horse/human class\n",
    "horse_ds = dataset['train'].filter(lambda x: x['label'] == 0)\n",
    "human_ds = dataset['train'].filter(lambda x: x['label'] == 1)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "fab03aa8",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Take a few examples < 16\n",
    "n_examples = 5\n",
    "horse_examples = horse_ds.take(n_examples)\n",
    "human_examples = human_ds.take(n_examples)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "c33f1acd",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Display the examples\n",
    "fig, axes = plt.subplots(1, n_examples, figsize=(15, 15))\n",
    "for i, example in enumerate(human_examples):\n",
    "    image = example['image']\n",
    "    axes[i].imshow(image)\n",
    "    axes[i].set_title(f\"humans {i+1}\")\n",
    "plt.show()\n",
    "\n",
    "fig, axes = plt.subplots(1, n_examples, figsize=(15, 15))\n",
    "for i, example in enumerate(horse_examples):\n",
    "    image = example['image']\n",
    "    axes[i].imshow(image)\n",
    "    axes[i].set_title(f\"horses {i+1}\")\n",
    "plt.show()\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "25f3eeb3",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Split the dataset into training and validation sets\n",
    "# as_supervised: Specifies whether to return the dataset as a tuple\n",
    "# of (input, label) pairs.\n",
    "train_dataset, valid_dataset = tfds.load('horses_or_humans', split=['train','test'], as_supervised=True)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "29dc0e62",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Get the number of elements in the training and validation dataset\n",
    "train_size = tf.data.experimental.cardinality(train_dataset).numpy()\n",
    "valid_size = tf.data.experimental.cardinality(valid_dataset).numpy()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "db8aaf91",
   "metadata": {},
   "outputs": [],
   "source": [
    "IMG_SIZE = 300\n",
    "NUM_CLASSES = 2\n",
    "\n",
    "def preprocess(image, label):\n",
    "    image = tf.cast(image, tf.float32)\n",
    "#    # Resize the images to a fixed size\n",
    "    image = tf.image.resize(image, (IMG_SIZE, IMG_SIZE))\n",
    "#    # Rescale the pixel values to be between 0 and 1\n",
    "    image = image / 255.0\n",
    "    label = tf.one_hot(label, NUM_CLASSES)\n",
    "    return image, label"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "d59661c3",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Apply the preprocessing function to the datasets\n",
    "train_dataset = train_dataset.map(preprocess)\n",
    "valid_dataset = valid_dataset.map(preprocess)\n",
    "\n",
    "# Batch and shuffle the datasets\n",
    "train_dataset = train_dataset.shuffle(2000).batch(80)\n",
    "valid_dataset = valid_dataset.batch(20)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "9399bc99",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Get the number of elements in the trainingand validation dataset\n",
    "train_size = tf.data.experimental.cardinality(train_dataset).numpy()\n",
    "valid_size = tf.data.experimental.cardinality(valid_dataset).numpy()\n",
    "print(\"Training dataset size:\", train_size)\n",
    "print(\"Validation dataset size:\", valid_size)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "13af7d53",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Store images and labels of the validation data for predictions\n",
    "for images, labels in valid_dataset:\n",
    "    x_val = images\n",
    "    y_val = labels\n",
    "    \n",
    "print(x_val.shape, y_val.shape)"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.8.16"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
 }
--- a/notebooks/03_ml_basics_ex_4_mlp_clothing.ipynb
+++ b/notebooks/03_ml_basics_ex_4_mlp_clothing.ipynb
@ -0,0 +1,236 @@
 {
 "cells": [
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "6c180d4b",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Exercise 3\n",
    "# fashion mnist data\n",
    "# MLP model with two hidden layers, each with a ReLU activation function.\n",
    "# Input data is flattened to a 1D array and passed to the model."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "b0e31b9c",
   "metadata": {},
   "outputs": [],
   "source": [
    "import tensorflow as tf\n",
    "from tensorflow import keras\n",
    "import matplotlib.pyplot as plt\n",
    "import numpy as np"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "1ae1412e",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Load the MNIST Fashion dataset\n",
    "(x_train, y_train), (x_test, y_test) = keras.datasets.fashion_mnist.load_data()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "f8814914",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Normalize pixel values to between 0 and 1\n",
    "x_train = x_train.astype(\"float32\") / 255.0\n",
    "x_test = x_test.astype(\"float32\") / 255.0"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "2810da39",
   "metadata": {},
   "outputs": [],
   "source": [
    "# MNIST dataset images have a shape of (28, 28). The images are flattened\n",
    "# into a 1D array of length 784 \n",
    "x_train = x_train.reshape(-1, 784)\n",
    "x_test = x_test.reshape(-1, 784)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "96f7ff8a",
   "metadata": {},
   "outputs": [],
   "source": [
    "# The model is defined here with three dense (fully connected) layers\n",
    "# The first layer is a Dense layer with 128 units and a ReLU activation\n",
    "# function with an input shape of (784,). This layer serves as the input\n",
    "# layer of the model.\n",
    "# The second layer is also a Dense layer with 64 units and a ReLU activation\n",
    "# function. This layer takes the output of the previous layer as input, and\n",
    "# applies a non-linear transformation to it to produce a new set of features\n",
    "# that the next layer can use.\n",
    "# The third is another Dense layer, one for each class in the output. The\n",
    "# output is raw scores or logits for each class since there is no activation\n",
    "# function . This layer is responsible for producing the final output of the\n",
    "# model, which can then be used to make predictions.\n",
    "# With Dropout(0.2) 20 % of the input is randomly droped, this should reduce overfitting\n",
    "model = keras.Sequential([\n",
    "    keras.layers.Dense(128, activation='relu', input_shape=(784,)),\n",
    "    # keras.layers.Dropout(0.2),\n",
    "    keras.layers.Dense(64, activation='relu'),\n",
    "    keras.layers.Dense(10)\n",
    "])\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "a3fe609c",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Compile the model\n",
    "# adam = specifies the optimizer to use during training\n",
    "# loss function to use during training, SparseCategoricalCrossentropy loss\n",
    "# is commonly used for multi-class classification problems.\n",
    "# from_logits=True indicates that the model's output is a raw score\n",
    "# for each class and not  a probability distribution.\n",
    "model.compile(optimizer='adam',\n",
    "              loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True),\n",
    "              metrics=['accuracy'])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "cf6c978d",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Train the model\n",
    "history = model.fit(x_train, y_train, epochs=10, validation_split=0.2)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "97fc2313",
   "metadata": {
    "scrolled": true
   },
   "outputs": [],
   "source": [
    "# Evaluate the model on the test set\n",
    "test_loss, test_acc = model.evaluate(x_test, y_test, verbose=2)\n",
    "print(\"Test accuracy:\", test_acc)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "ef5f19d0",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Plot the training and validation accuracy and loss over time\n",
    "plt.figure(figsize=(10, 4))\n",
    "plt.subplot(1, 2, 1)\n",
    "plt.plot(history.history[\"accuracy\"])\n",
    "plt.plot(history.history[\"val_accuracy\"])\n",
    "plt.title(\"Model accuracy\")\n",
    "plt.ylabel(\"Accuracy\")\n",
    "plt.xlabel(\"Epoch\")\n",
    "plt.legend([\"Train\", \"Validation\"], loc=\"lower right\")\n",
    "\n",
    "plt.subplot(1, 2, 2)\n",
    "plt.plot(history.history[\"loss\"])\n",
    "plt.plot(history.history[\"val_loss\"])\n",
    "plt.title(\"Model loss\")\n",
    "plt.ylabel(\"Loss\")\n",
    "plt.xlabel(\"Epoch\")\n",
    "plt.legend([\"Train\", \"Validation\"], loc=\"upper right\")\n",
    "\n",
    "plt.show()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "c0ebddc4",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Plot a confusion matrix of the test set predictions\n",
    "test_preds = np.argmax(model.predict(x_test), axis=1)\n",
    "conf_mat = tf.math.confusion_matrix(y_test, test_preds)\n",
    "plt.imshow(conf_mat, cmap=\"Blues\")\n",
    "plt.xlabel(\"Predicted labels\")\n",
    "plt.ylabel(\"True labels\")\n",
    "plt.xticks(np.arange(10))\n",
    "plt.yticks(np.arange(10))\n",
    "plt.colorbar()\n",
    "plt.show()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "9175d533",
   "metadata": {
    "scrolled": true
   },
   "outputs": [],
   "source": [
    "# Make predictions on the test set\n",
    "y_pred = model.predict(x_test)\n",
    "y_pred = np.argmax(y_pred, axis=1)\n",
    "\n",
    "# Plot some examples from the test set and their predictions\n",
    "fig, axes = plt.subplots(4, 4, figsize=(18, 18))\n",
    "for i, ax in enumerate(axes.ravel()):\n",
    "    ax.matshow(x_test[i].reshape(28, 28), cmap='gray')\n",
    "    ax.set_title(\"True: %d\\nPredict: %d\" % (y_test[i], y_pred[i]))\n",
    "    ax.axis(\"off\")\n",
    "\n",
    "plt.show()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "4a6e85be",
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.8.16"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
 }
--- a/notebooks/03_ml_basics_minimizer.ipynb
+++ b/notebooks/03_ml_basics_minimizer.ipynb
@ -0,0 +1,166 @@
 {
 "cells": [
  {
   "cell_type": "markdown",
   "id": "042acd49",
   "metadata": {},
   "source": [
    "# Test a minimizer"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "cb51a492",
   "metadata": {},
   "outputs": [],
   "source": [
    "import tensorflow as tf\n",
    "import numpy as np\n",
    "import matplotlib.pyplot as plt\n",
    "from scipy.optimize import minimize\n",
    "plt.style.use(\"ggplot\")\n",
    "from matplotlib import colors, cm"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "2ac3651a",
   "metadata": {},
   "source": [
    "plt.rcParams controls the appearance of your plots globally,\n",
    "affecting all subsequent plots created in your session."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "97ef9933",
   "metadata": {},
   "outputs": [],
   "source": [
    "plt.rcParams[\"axes.grid\"] = False\n",
    "plt.rcParams.update({'font.size': 20})\n",
    "plt.rcParams.update({'figure.figsize': (12,9)})\n",
    "plt.rcParams['lines.markersize'] = 8"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "f15200f9",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Generate data points with gaussian smearing\n",
    "data = np.random.uniform(size=100)\n",
    "labels = 5.*data*data*data + 1 + np.random.normal(loc=0.0, scale=0.1, size=100)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "7237f5ed",
   "metadata": {},
   "outputs": [],
   "source": [
    "# show plot\n",
    "plt.scatter(data, labels, label=\"data\")\n",
    "plt.legend()\n",
    "plt.show()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "0d6e104c",
   "metadata": {},
   "outputs": [],
   "source": [
    "# define chi2 like cost function\n",
    "def cost(params):\n",
    "    W, b = params\n",
    "    return np.mean((labels - (W*data*data*data + b))**2)"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "8e00e16a",
   "metadata": {},
   "source": [
    "Call the minimizer.\n",
    "`scipy.optimize` provides a collection of optimization algorithms for finding the minimum or maximum of a given function. "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "433975c3",
   "metadata": {},
   "outputs": [],
   "source": [
    "res = minimize(cost, [1., 1.])\n",
    "# returns an OptimizeResult object\n",
    "# x :the solution (minimum) of the optimization problem, represented as an\n",
    "# array.\n",
    "# Results of the minimization\n",
    "W, b = res.x\n",
    "print ('function value at the minimum and fitted parameters',res.fun,'  ',W,'  ',b)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "1e1f4e81",
   "metadata": {},
   "outputs": [],
   "source": [
    "points = np.linspace(0, 1, 100)\n",
    "prediction = W*points*points*points + b"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "d8de971e",
   "metadata": {},
   "outputs": [],
   "source": [
    "# plot fit model\n",
    "plt.scatter(data, labels, label=\"data\")\n",
    "plt.plot(points, prediction, label=\"model\", color=\"green\")\n",
    "plt.legend()\n",
    "plt.show()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "4a7d62c2",
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.8.16"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
 }
--- a/notebooks/03_ml_basics_tf_broadcasting.ipynb
+++ b/notebooks/03_ml_basics_tf_broadcasting.ipynb
@ -0,0 +1,118 @@
 {
 "cells": [
  {
   "cell_type": "markdown",
   "id": "df1f5eb3",
   "metadata": {},
   "source": [
    "# demonstration of broadcasting in tensorflow"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "1d61c70a",
   "metadata": {},
   "outputs": [],
   "source": [
    "import tensorflow as tf"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "38bca1cf",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Define two tensors with different shapes\n",
    "a = tf.constant([[1, 2, 3], [4, 5, 6]])\n",
    "b = tf.constant([10, 20, 30])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "c3f382e3",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Perform element-wise multiplication using broadcasting\n",
    "c = a * b\n",
    "# Print the result\n",
    "print(c)\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "95683fe5",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Broadcasting scalar to tensor\n",
    "x = tf.constant([1, 2, 3])\n",
    "y = 2\n",
    "z = x + y  # equivalent to tf.add(x, y)\n",
    "print(z.numpy())  # [3 4 5]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "8ed98565",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Broadcasting vector to matrix\n",
    "x = tf.constant([[1, 2], [3, 4]])\n",
    "y = tf.constant([1, 2])\n",
    "z = x + y  # equivalent to tf.add(x, y)\n",
    "print(z.numpy())  # [[2 4], [4 6]]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "41f4196f",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Broadcasting matrix to tensor\n",
    "x = tf.constant([[[1, 2], [3, 4]], [[5, 6], [7, 8]]])\n",
    "y = tf.constant([[1], [2]])\n",
    "z = x + y  # equivalent to tf.add(x, y)\n",
    "print(z.numpy())  # [[[2 3], [5 6]], [[6 7], [9 10]]]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "76a5108d",
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.8.16"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
 }
--- a/notebooks/03_ml_basics_tf_differentiate.ipynb
+++ b/notebooks/03_ml_basics_tf_differentiate.ipynb
@ -0,0 +1,102 @@
 {
 "cells": [
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "eefe7571",
   "metadata": {},
   "outputs": [],
   "source": [
    "# show differentiation in Tensorflow"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "a9d7c185",
   "metadata": {},
   "outputs": [],
   "source": [
    "import tensorflow as tf"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "584384f1",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Define a function to differentiate\n",
    "def f(x):\n",
    "    return x ** 2 + 2 * x + 1"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "70430402",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Create a TensorFlow variable\n",
    "x = tf.Variable(2.0)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "45ea0a33",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Use tf.GradientTape to record the gradients\n",
    "with tf.GradientTape() as tape:\n",
    "    y = f(x)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "f6b1ff27",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Calculate the gradient of y with respect to x\n",
    "dy_dx = tape.gradient(y, x)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "4f581817",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Print the result\n",
    "print(dy_dx)"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.8.16"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
 }
--- a/slides/01_intro_python.md
+++ b/slides/01_intro_python.md
@ -0,0 +1,988 @@
 % Introduction to Data Analysis and Machine Learning in Physics: \ 1. Introduction to python
 % Day 1: 11. April 2023
 % \underline{Jörg Marks}, Klaus Reygers
 ## Outline of the $1^{st}$ day
 * Technical instructions for your interactions with the CIP pool, like
   * using the jupyter hub
   * using python locally in your own linux environment (anaconda)
   * access the CIP pool from your own windows or linux system
   * transfer data from and to the CIP pool
  Can be found in [\textcolor{violet}{CIPpoolAccess.PDF}](https://www.physi.uni-heidelberg.de/~marks/root_einfuehrung/Folien/CIPpoolAccess.PDF)\normalsize
 * Summary of NumPy
 * Plotting with matplotlib
 * Input / output of data
 * Summary of pandas
 * Fitting with iminuit and PyROOT
 ## A glimpse into python libraries
 The following python libraries are important for \textcolor{red}{data analysis and machine
 learning} and will be useful during the course
 * [\textcolor{violet}{NumPy}](https://numpy.org/doc/stable/user/basics.html) - python library adding support for large,
   multi-dimensional arrays and matrices, along with high-level
   mathematical functions to operate on these arrays
 * [\textcolor{violet}{matplotlib}](https://matplotlib.org/stable/tutorials/index.html) - a python plotting library
 * [\textcolor{violet}{SciPy}](https://docs.scipy.org/doc/scipy/reference/tutorial/index.html) - extension of NumPy by a collection of
   mathematical algorithms for minimization, regression, 
   fourier transformation, linear algebra and image processing
 * [\textcolor{violet}{iminuit}](https://iminuit.readthedocs.io/en/stable/) -
   python wrapper to the data fitting toolkit
   [\textcolor{violet}{Minuit2}](https://root.cern.ch/doc/master/Minuit2Page.html)
   developed at CERN by F. James in the 1970s
 * [\textcolor{violet}{PyROOT}](https://root.cern/manual/python/) - python wrapper to the C++ data analysis toolkit
   ROOT [\textcolor{violet}{(lecture WS 2021 / 22)}](https://www.physi.uni-heidelberg.de/~marks/root_einfuehrung/) used at the LHC 
 * [\textcolor{violet}{scikit-learn}](https://scikit-learn.org/stable/) - machine learning library written in
   python, which makes extensive use of NumPy for high-performance
   linear algebra algorithms
 ## NumPy
   \textcolor{blue}{NumPy} (Numerical Python) is an open source python library,
   which contains multidimensional array and matrix data structures and methods
   to efficiently operate on these. The core object is
   a homogeneous n-dimensional array object,  \textcolor{blue}{ndarray}, which
   allows for a wide variety of \textcolor{blue}{fast operations and mathematical calculations
   with arrays and matrices} due to the extensive usage of compiled code.  
   * It is heavily used in numerous scientific python packages
   * `ndarray` 's  have a fixed size at creation $\rightarrow$ changing size
     leads to recreation
   * Array elements are all required to be of the same data type
   * Facilitates advanced mathematical operations on large datasets
   * See for a summary, e.g. &nbsp;&nbsp;  
 \small
 [\textcolor{violet}{https://cs231n.github.io/python-numpy-tutorial/\#numpy}](https://cs231n.github.io/python-numpy-tutorial/#numpy) \normalsize
 \vfill
 ::: columns
 :::: {.column width=30%}
 ::::
 :::
 ::: columns
 :::: {.column width=35%}
 `c = []`
 `for i in range(len(a)):`
 &nbsp;&nbsp;&nbsp; `c.append(a[i]*b[i])`
 ::::
 :::: {.column width=35%}
 with NumPy
 `c = a * b`
 ::::
 :::
 <!---
 It seem we need to indent by hand.
 I don't manage to align under the bullet text
 If we do it with column the vertical space is with code sections not good
 If we do it without code section the vertical space is ok, but there is no
 code high lightning.
 See the different versions of the same page in the following
 -->
 ## NumPy - array basics (1)
 * numpy arrays build a grid of \textcolor{blue}{same type} values, which are indexed.
  The *rank* is the dimension of the array.
  There are methods to create  and preset arrays.
 \footnotesize
 ```python
 	 myA = np.array([12, 5 , 11])            # create rank 1 array (vector like)
 	 type(myA)                               # <class 'numpy.ndarray'>
 	 myA.shape                               # (3,)
 	 print(myA[2])                           # 11   access 3. element
 	 myA[0] = 12                             # set 1. element to 12
 	 myB = np.array([[1,5],[7,9]])           # create  rank 2 array
 	 myB.shape                               # (2,2)
 	 print(myB[0,0],myB[0,1],myB[1,1])       # 1 5 9
 	 myC = np.arange(6)                      # create rank 1 set to 0 - 5
 	 myC.reshape(2,3)                        # returns array of shape (2,3), myC unchanged
 	 zero = np.zeros((2,5))                  # 2 rows, 5 columns, set to 0
 	 one = np.ones((2,2))                    # 2 rows, 2 columns, set to 1
 	 five = np.full((2,2), 5)                # 2 rows, 2 columns, set to 5
 	 e = np.eye(2)                           # create 2x2 identity matrix
 ```
 \normalsize
 ## NumPy - array basics (2)
 * Similar to a coordinate system numpy arrays also have \textcolor{blue}{axes}. numpy operations
 can be performed along these axes.
 \footnotesize
 ::: columns
 :::: {.column width=35%}
 ```python
 # 2D arrays 
 five = np.full((2,3), 5)                # 2 rows, 3 columns, set to 5
 seven = np.full((2,3), 7)               # 2 rows, 3 columns, set to 7
 np.concatenate((five,seven), axis = 0)  # results in a 4 x 3 array
 np.concatenate((five,seven), axis = 1)  # results in a 2 x 6 array
 # 1D array
 one = np.array([1, 1 , 1])              # 1D array of length 3, set to 1
 four = np.array([4, 4 , 4])             # 1D array of length 3, set to 4
 np.concatenate((one,four), axis = 0)    # concat. arrays horizontally! 
 ```
 ::::
 :::: {.column width=50%}
 \vspace{3cm}
 ![](figures/numpy_axes.png)
 ::::
 :::
 \normalsize
 ##  NumPy - array indexing (1)
 * select slices of a numpy array
 \footnotesize
 ```python
     a = np.array([[1,2,3,4],
                   [5,6,7,8],                # 3 rows 4 columns array
                   [9,10,11,12]])
     b = a[:2, 1:3]                          # subarray of 2 rows and
         array([[2, 3],                      # column 1 and 2
                [6, 7]])
 ```		    
 \normalsize
 * a slice of an array points into the same data, *modifying* changes the original array!
 \footnotesize
 ```python
     b[0, 0] = 77	                         # b[0,0] and a[0,1] are 77
     r1_row = a[1, :]                        # get 2nd row ->  rank 1
     r1_row.shape	                         # (4,)
     r2_row = a[1:2, :]                      # get 2nd row -> rank 2
     r2_row.shape                            # (1,4)
     a=np.array([[1,2],[3,4],[5,6]])         # set a , 3 rows 2 cols
     d=a[[0, 1, 2], [0, 1, 1]]               # d contains [1 4 6]
     e=a[[1, 2], [1, 1]]                     # e contains [4 6]
     np.array([a[0,0],a[1,1],a[2,1]])        # same as d, elements addressed explicitly
 ```
 \normalsize
 ##  NumPy - array indexing (2)
 * integer array indexing by setting an array of indices $\rightarrow$ selecting/changing elements
 \footnotesize
 ```python
     a = np.array([[1,2,3,4],
                   [5,6,7,8],                # 3 rows 4 columns array
                   [9,10,11,12]])
     p_a = np.array([0,2,0])                 # Create an array of indices
     s = a[np.arange(3), p_a]                # number the rows, p_a points to cols
     print (s)                               # s contains [1 7 9]
     a[np.arange(3),p_a] += 10               # add 10 to corresponding elements
     x=np.array([[8,2],[7,4]])               # create 2x2 array
     bool = (x > 5)                          # bool : array of booleans
                                             #   [[True False]
                                             #    [True False]]
     print(x[x>5])                           # select elements, prints [8 7]
 ```		    
 \normalsize
 * data type in numpy - create according to input numbers or set explicitly
 \footnotesize
 ```python
     x = np.array([1.1, 2.1])                # create float array 
     print(x.dtype)                          # print  float64
     y=np.array([1.1,2.9],dtype=np.int64)    # create integer array [1 2]
 ```
 \normalsize
 ## NumPy - functions
 * math functions operate elementwise either as operator overload or as methods
 \footnotesize
 ```python
     x=np.array([[1,2],[3,4]],dtype=np.float64)    # define 2x2 float array
     y=np.array([[3,1],[5,1]],dtype=np.float64)    # define 2x2 float array
     s = x + y                                     # elementwise sum 
     s = np.add(x,y)
     s = np.subtract(x,y)
     s = np.multiply(x,y)                          # no matrix multiplication!
     s = np.divide(x,y)
     s = np.sqrt(x), np.exp(x), ...
     x @ y , or np.dot(x, y)                       # matrix product
     np.sum(x, axis=0)                             # sum of each column
     np.sum(x, axis=1)                             # sum of each row
     xT = x.T                                      # transpose of x
     x = np.linspace(0,2*np.pi,100)                # get equally spaced points in x
     r = np.random.default_rng(seed=42)            # constructor random number class
     b = r.random((2,3))                           # random 2x3 matrix
 ```
 \normalsize
 ##
 *  broadcasting in  numpy
  \vspace{0.4cm}
   The term \textcolor{blue}{broadcasting} describes how numpy treats arrays
   with different shapes during arithmetic operations
   * add a scalar $b$ to a 1D array $a = [a_1,a_2,a_3]$ $\rightarrow$ expand $b$ to
     $[b,b,b]$
     \vspace{0.2cm}
   * add a  scalar $b$ to a 2D [2,3] array  $a =[[a_{11},a_{12},a_{13}],[a_{21},a_{22},a_{23}]]$
     $\rightarrow$ expand $b$ to $b =[[b,b,b],[b,b,b]]$ and add element wise
     \vspace{0.2cm}
   * add 1D array $b = [b_1,b_2,b_3]$ to a 2D [2,3] array $a=[[a_{11},a_{12},a_{13}],[a_{21},a_{22},a_{23}]]$   $\rightarrow$  1D array is broadcast
     across each row of the 2D array $b =[[b_1,b_2,b_3],[b_1,b_2,b_3]]$ and added  element wise 
 \vspace{0.2cm}
   Arithmetic operations can only be performed when, for each dimension,
   the sizes of the arrays are equal or one of them is 1. Look
   [\textcolor{violet}{here}](https://numpy.org/doc/stable/user/basics.broadcasting.html) for more details 
 \footnotesize
 ```python
     # Add a vector to each row of a matrix
     x = np.array([[1,2,3], [4,5,6]]) # x has shape (2, 3)
     v = np.array([1,2,3])            # v has shape (3,)
     x + v     # [[2 4 6]
               #  [5 7 9]]    
 ```
 \normalsize
 ## Plot data
 A popular library to present data is the `pyplot` module of `matplotlib`.
 * Drawing a function in one plot
 \footnotesize
 ::: columns
 :::: {.column width=35%}
 ```python
 import numpy as np
 import matplotlib.pyplot as plt
 # generate 100 points from 0 to 10 pi
 x = np.linspace( 0, 10*np.pi, 100 )
 f = np.sin(x)**2
 # plot function
 plt.plot(x,f,'blueviolet',label='sine')
 plt.xlabel('x [radian]')
 plt.ylabel('f(x)')
 plt.title('Plot sin^2')
 plt.legend(loc='upper right')
 plt.axis([0,30,-0.1,1.2]) # limit the plot range
 # show the plot
 plt.show()
 ```
 ::::
 :::: {.column width=40%}
 ![](figures/matplotlib_Figure_1.png)
 ::::
 :::
 \normalsize
 ##
 * Drawing a scatter plot of data 
 \footnotesize
 ::: columns
 :::: {.column width=35%}
 ```python
 ...
 # create x,y data points
 num = 75
 x = range(num)
 y = range(num) +  np.random.randint(0,num/1.5,num)
 z = - (range(num) +  np.random.randint(0,num/3,num)) + num
 # create colored scatter plot, sample 1
 plt.scatter(x, y, color = 'green',
                  label='Sample 1')
 # create colored scatter plot, sample 2
 plt.scatter(x, z, color = 'orange',
                  label='Sample 2')
 plt.title('scatter plot')
 plt.xlabel('x')
 plt.ylabel('y')
 # description and plot
 plt.legend()
 plt.show()
 ```
 ::::
 :::: {.column width=35%}
 \vspace{3cm}
 ![](figures/matplotlib_Figure_6.png)
 ::::
 :::
 \normalsize
 ##
 * Drawing a histogram of data 
 \footnotesize
 ::: columns
 :::: {.column width=35%}
 ```python
 ...
 # create normalized gaussian Distribution
 g = np.random.normal(size=10000)
 # histogram the data
 plt.hist(g,bins=40)
 # plot rotated histogram
 plt.hist(g,bins=40,orientation='horizontal')
 # normalize area to 1
 plt.hist(g,bins=40,density=True)
 # change color
 plt.hist(g,bins=40,density=True,
   edgecolor='lightgreen',color='orange')
 plt.title('Gaussian Histogram')
 plt.xlabel('bin')
 plt.ylabel('entries')
 # description and plot
 plt.legend(['Normalized distribution'])
 plt.show()
 ```
 ::::
 :::: {.column width=35%}
 \vspace{3.5cm}
 ![](figures/matplotlib_Figure_5.png)
 ::::
 :::
 \normalsize
 ##
 * Drawing subplots in one canvas
 \footnotesize
 ::: columns
 :::: {.column width=35%}
 ```python
 ...
 g = np.exp(-0.2*x)
 # create figure
 plt.figure(num=2,figsize=(10.0,7.5),dpi=150,facecolor='lightgrey')
 plt.suptitle('1 x 2 Plot')
 # create subplot and plot first one
 plt.subplot(1,2,1)
 # plot first one
 plt.title('exp(x)')
 plt.xlabel('x')
 plt.ylabel('g(x)')
 plt.plot(x,g,'blueviolet')
 # create subplot and plot second one 
 plt.subplot(1,2,2)
 plt.plot(x,f,'orange')
 plt.plot(x,f*g,'red')
 plt.legend(['sine^2','exp*sine'])
 # show the plot
 plt.show()
 ```
 ::::
 :::: {.column width=40%}
 \vspace{3cm}
 ![](figures/matplotlib_Figure_2.png)
 ::::
 :::
 \normalsize
 ## Image data 
 The `image` class of the `matplotlib` library can be used to load the image
 to numpy arrays and to render the image.
 * There are 3 common formats for the numpy array  
  * (M, N) scalar data used for greyscale images
  * (M, N, 3) for RGB images (each pixel has an array with RGB color attached) 
  * (M, N, 4) for RGBA images (each pixel has an array with RGB color
    and transparency attached)
  The method `imread` loads the image into an `ndarray`, which can be
  manipulated.
  The method `imshow` renders the image data
 \vspace {2cm}
 ##
 * Drawing pixel data and images
 \footnotesize
 ::: columns
 :::: {.column width=50%}
 ```python
 ....
 # create data array with pixel position and RGB color code
 width, height = 200, 200
 data = np.zeros((height, width, 3), dtype=np.uint8)
 # red patch in the center
 data[75:125, 75:125] = [255, 0, 0] 
 x = np.random.randint(0,width-1,100)
 y = np.random.randint(0,height-1,100)
 data[x,y]= [0,255,0] # 100 random green pixel
 plt.imshow(data)
 plt.show()
 ....
 import matplotlib.image as mpimg
 #read image into numpy array
 pic = mpimg.imread('picture.jpg')
 mod_pic = pic[:,:,0] # grab slice 0 of the colors
 plt.imshow(mod_pic)  # use default color code also
 plt.colorbar()       # try cmap='hot' 
 plt.show()
 ```
 ::::
 :::: {.column width=25%} 
 ![](figures/matplotlib_Figure_3.png)
 \vspace{1cm}
 ![](figures/matplotlib_Figure_4.png)
 ::::
 ::: 
 \normalsize
 ## Input / output
 For the analysis of measured data efficient input \/ output plays an
 important role. In numpy, `ndarrays` can be saved and read in from files.
 `load()` and `save()` functions handle numpy binary files (.npy extension)
 which contain  data, shape, dtype and other information required to
 reconstruct the `ndarray` from the disk file.
 \footnotesize
 ```python
   r = np.random.default_rng()       # instantiate random number generator
   a = r.random((4,3))               # random 4x3 array
   np.save('myBinary.npy', a)        # write array a to binary file myBinary.npy
   b = np.arange(12)                 
   np.savez('myComp.npz', a=a, b=b)  # write a and b to one uncompressed .npz archive
   ......
   b = np.load('myBinary.npy')       # read content of myBinary.npy into b
 ```
 \normalsize
 The storage and retrieval of array data in text file format is done
 with the `savetxt()` and `loadtxt()` methods. Parameters controlling delimiter,
 line separators, file header and footer can be specified.
 \footnotesize
 ```python
   x = np.array([1,2,3,4,5,6,7])                      # create ndarray 
   np.savetxt('myText.txt',x,fmt='%d', delimiter=',') # write array x to file myText.txt
                                                      # with comma separation
 ```
 \normalsize
 ## Input / output
 Import tabular data from table processing programs in office packages.
 \vspace{0.4cm}
 \footnotesize
 ::: columns
 :::: {.column width=35%}
 `Excel data` can be exported as text file (myData_01.csv) with a comma as
 delimiter.
 ::::
 :::: {.column width=35%}
 ![](figures/numpy_excel.png)
 ::::
 :::
 \footnotesize
 ```python
   .....
   # read content of the file myData_01.csv into data
   data = np.loadtxt('myData_01.csv',dtype=int,delimiter=',')
   print (data.shape)               #  (12, 9)
   print (data)                     #  [[1 1 1 1 0 0 0 0 0]
                                    #   [0 0 1 1 0 0 1 1 0]
                                    #  .....
                                    #   [0 0 0 0 1 1 1 1 1]]    
 ```
 \normalsize
 ## Input / output
 Import tabular data from table processing programs in office packages.
 \vspace{0.4cm}
 \footnotesize
 ::: columns
 :::: {.column width=35%}
 `Excel data` can be exported as text file (myData_01.csv) with a comma as
 delimiter. \newline
 $\color{blue}{Often~many~files~are~available~(myData\_*.csv)}$
 ::::
 :::: {.column width=35%}
 ![](figures/numpy_multi_excel.png)
 ::::
 :::
 \footnotesize
 ```python
   .....
   # find files and directories with names matching a pattern
   import glob              
   # read content of all files myData_*.csv into data
   file_list = sorted(glob.glob('myData_*.csv')) # generate a sorted file list
   for filename in file_list:
       data = np.loadtxt(fname=filename, dtype=int, delimiter=',')
       print(data[:,3])     # print column 3 of each file
                            # [1 1 1 1 1 1 1 1 1 1 1 0]
                            # ......
                            # [0 1 0 1 0 1 0 1 0 1 0 1]
 ```
 \normalsize
 ## Exercise 1
 i) Display a numpy array as figure of a blue cross. The size should be 200
   by 200 pixel. Use as array format (M, N, 3), where the first 2 specify
   the pixel positions and the last 3 the rgb color from 0:255.
   - Draw in addition a red square of arbitrary position into the figure.
   - Draw a circle in the center of the figure. Try to create a mask which
     selects the inner part of the circle using the indexing.
   \small
   [Solution:  01_intro_ex_1a_sol.ipynb](https://www.physi.uni-heidelberg.de/~reygers/lectures/2022/ml/solutions/01_intro_ex_1a_sol.ipynb) \normalsize
 ii) Read data which contains pixels from the binary file horse.py into a
    numpy array. Display the data and the following transformations in 4
    subplots: scaling and translation, compression in x and y, rotation
    and mirroring.
    \small
    [Solution: 01_intro_ex_1b_sol.ipynb](https://www.physi.uni-heidelberg.de/~reygers/lectures/2022/ml/solutions/01_intro_ex_1b_sol.ipynb) \normalsize 
 ## Pandas
 [\textcolor{violet}{pandas}](https://pandas.pydata.org/pandas-docs/stable/getting_started/index.html) is a software library written in python for
 \textcolor{blue}{data manipulation and analysis}. 
 \vspace{0.4cm}
 \setbeamertemplate{itemize item}{\color{red}\tiny$\blacksquare$}
 * Offers data structures and operations for manipulating numerical tables with
  integrated indexing
 * Imports data from various file formats, e.g. comma-separated values, JSON,
  SQL or Excel
 * Tools for reading and writing data structures, allows analyzing, filtering,
  splitting, grouping and aggregating, merging and joining and plotting
 * Built on top of `NumPy`
 * Visualize the data with `matplotlib`
 * Most machine learning tools support `pandas` $\rightarrow$ 
  it is widely used to preprocess data sets for analysis and  machine learning
  in various scientific fields
 ## Pandas micro introduction
 Goal: Exploring, cleaning, transforming, and visualization of data.
 The basic indexable objects are
 \setbeamertemplate{itemize item}{\color{red}\tiny$\blacksquare$}
 * `Series` -> vector (list) of data elements of arbitrary  type
 * `DataFrame` -> tabular arrangement of data elements of column wise
                 arbitrary type
   Both allow cleaning data by removing of `empty` or `nan` data entries
 \footnotesize
 ```python
     import numpy as np
     import pandas as pd                    # use together with numpy
     s = pd.Series([1, 3, 5, np.nan, 6, 8]) # create a Series of float64 (NaN forces float)
     r = pd.Series(np.random.randn(4))      # Series of random numbers float64 
     dates = pd.date_range("20130101", periods=3) # index according to dates
     df = pd.DataFrame(np.random.randn(3,4),index=dates,columns=list("ABCD"))
     print (df)                             # print the DataFrame
                        A         B         C         D
          2013-01-01  1.618395  1.210263 -1.276586 -0.775545
          2013-01-02  0.676783 -0.754161 -1.148029 -0.244821
          2013-01-03 -0.359081  0.296019  1.541571  0.235337
     new_s = s.dropna() # return a new Series without the NaN entries
 ```
 \normalsize
 ##
 \setbeamertemplate{itemize item}{\color{red}\tiny$\blacksquare$}
 * pandas data can be saved in different file formats (CSV, JSON, html, XML,
  Excel, OpenDocument, HDF5 format, .....). `NaN` entries are kept
  in the output file, except if they are removed with  `dataframe.dropna()`
   * csv file
     \footnotesize
     ```python
     df.to_csv("myFile.csv")  # Write the DataFrame df to a csv file 
     ```
      \normalsize
   * HDF5 output
     \footnotesize
     ```python  
     df.to_hdf("myFile.h5",key='df',mode='w') # Write the DataFrame df to HDF5
     s.to_hdf("myFile.h5", key='s',mode='a')	  
     ```
     \normalsize
   * Writing to an excel file
     \footnotesize
     ```python  
     df.to_excel("myFile.xlsx", sheet_name="Sheet1")
     ```
     \normalsize
 * Deleting file with data in python
 \footnotesize
 ```python  
     import os
     os.remove('myFile.h5')
 ```
 \normalsize
 ##
 \setbeamertemplate{itemize item}{\color{red}\tiny$\blacksquare$}
 * read in data from various formats
   * csv file
     \footnotesize
     ```python
      .......
      df = pd.read_csv('heart.csv')  # read csv data table
      print(df.info())
         <class 'pandas.core.frame.DataFrame'>
         RangeIndex: 303 entries, 0 to 302
         Data columns (total 14 columns):
         #   Column    Non-Null Count  Dtype  
         ---  ------    --------------  -----  
         0   age       303 non-null    int64  
         1   sex       303 non-null    int64  
         2   cp        303 non-null    int64
         print(df.head(5))       # prints the first 5 rows of the data table 
         print(df.describe())    # shows a quick statistic summary of your data
     ```
 \normalsize
   * Reading an excel file
     \footnotesize
     ```python  
     df = pd.read_excel("myFile.xlsx","Sheet1", na_values=["NA"])
     ```
     \normalsize
     \textcolor{olive}{There are many options specifying details for IO.}
 ##
 \setbeamertemplate{itemize item}{\color{red}\tiny$\blacksquare$}
 * Various functions exist to select and view data from pandas objects
  * Display column and index
    \footnotesize
     ```python
     df.index                    # show datetime index of df
     DatetimeIndex(['2013-01-01','2013-01-02','2013-01-03'],
                   dtype='datetime64[ns]',freq='D')
     df.columns                  # show columns info
     Index(['A', 'B', 'C', 'D'], dtype='object')
     ```
     \normalsize
  * `DataFrame.to_numpy()` gives a `NumPy` representation of the underlying data
    \footnotesize
     ```python
     df.to_numpy()       # one dtype for the entire array, not per column!
     [[-0.62660101 -0.67330526  0.23269168 -0.67403546]
     [-0.53033339  0.32872063 -0.09893568  0.44814084]
     [-0.60289996 -0.22352548 -0.43393248  0.47531456]]
     ```
     \normalsize
     Does not include the index or column labels in the output
  * more on viewing 
    \footnotesize
    ```python
    df.T                                   # transpose the DataFrame df
    df.sort_values(by="B")                 # Sorting by values of column B of df
    df.sort_index(axis=0)                  # Sorting by index ascending values
    df.sort_index(axis=0,ascending=False)  # Sorting by index descending values
    ```
    \normalsize
 ##
 \setbeamertemplate{itemize item}{\color{red}\tiny$\blacksquare$}
 * Selecting data of pandas objects $\rightarrow$ keep or reduce dimensions
  * get a named column as a Series
    \footnotesize
     ```python
     df["A"]           # selects a column A from df, similar to  df.A
     df.iloc[:, 0:1]   # slices column A explicitly from df, df.loc[:, ["A"]]
     ```
     \normalsize
  * select rows of a DataFrame 
    \footnotesize
     ```python
     df[0:2]                   # selects row 0 and 1 from df, 
     df["20130102":"20130103"] # use indices, endpoints are included!
     df.iloc[2]                # select with the position of the passed integers
     df.iloc[1:3, :]           # selects row 1 and 2 from df
     ```
     \normalsize
  * select by label
     \footnotesize
     ```python
     df.loc["20130102":"20130103",["C","D"]] # selects row 1 and 2 and only C and D
     df.loc[dates[0], "A"]                   # selects a single value (scalar)
     ```
     \normalsize
  *  select by lists of integer position (as in `NumPy`)
     \footnotesize
     ```python
     df.iloc[[0, 2], [1, 3]] # select row 0 and 2 and col B and D (data only)
     df.iloc[1, 1]           # get a value explicitly (data only, no index lines)
     ```
     \normalsize
  *  select according to expressions
     \footnotesize
     ```python
     df.query('B<C')         # select rows where B < C
     df1=df[(df["B"]==0)&(df["D"]==0)] # conditions on rows
     ```
     \normalsize
 ##
 \setbeamertemplate{itemize item}{\color{red}\tiny$\blacksquare$}
 * Selecting data of pandas objects continued
  * Boolean indexing
    \footnotesize
     ```python
     df[df["A"] > 0]           # select the rows of df where column A is >0
     df[df > 0]                # select values >0 from the entire DataFrame
     ```
     \normalsize
     more complex example
     \footnotesize
     ```python
     df2 = df.copy()                     # copy df
     df2["E"] = ["eight","one","four"]   # add column E
     df2[df2["E"].isin(["two", "four"])] # test if elements "two" and  "four" are
                                         # contained in Series column E
     ```
     \normalsize
  * Operations (in general exclude missing data)
    \footnotesize
     ```python
     df2[df2 > 0] = -df2   # All elements > 0 change sign
     df.mean(0)            # get column wise mean (numbers=axis)  
     df.mean(1)            # get row wise mean
     df.std(0)             # standard deviation according to axis
     df.cumsum()           # cumulative sum of each column
     df.apply(np.sin)      # apply function to each element of df
     df.apply(lambda x: x.max() - x.min()) # apply lambda function column wise
     df + 10               # add scalar 10
     df - [1, 2, 10 , 100] # subtract values of each column
     df.corr()             # Compute pairwise correlation of columns
     ```
     \normalsize
 ##  Pandas - plotting data
 [\textcolor{violet}{Visualization}](https://pandas.pydata.org/pandas-docs/stable/user_guide/visualization.html) is integrated in pandas using matplotlib. Here are only 2 examples
 * Plot random data in histogram and scatter plot
 \footnotesize
 ```python
     # create DataFrame with random normal distributed data
     df = pd.DataFrame(np.random.randn(1000,4),columns=["a","b","c","d"])
     df = df + [1, 3, 8 , 10]      # shift column wise mean by  1, 3, 8 , 10
     df.plot.hist(bins=20)         # histogram all 4 columns
     g1 = df.plot.scatter(x="a",y="c",color="DarkBlue",label="Group 1")
     df.plot.scatter(x="b",y="d",color="DarkGreen",label="Group 2",ax=g1)
 ```
 \normalsize
 ::: columns
 :::: {.column width=35%}
 ![](figures/pandas_histogramm.png)
 ::::
 :::: {.column width=35%}
 ![](figures/pandas_scatterplot.png)
 ::::
 :::
 ##  Pandas - plotting data
 The function crosstab() takes one or more array-like objects as indexes or
 columns and constructs a new DataFrame of variable counts on the inputs
 \footnotesize
 ```python
   df = pd.DataFrame(                # create DataFrame of 2 categories
      {"sex":   np.array([0,0,0,0,1,1,1,1,0,0,0]),
       "heart": np.array([1,1,1,0,1,1,1,0,0,0,1])
      }  )                           # closing bracket goes on next line
   pd.crosstab(df.sex,df.heart)      # create cross table of possibilities
   pd.crosstab(df.sex,df.heart).plot(kind="bar",color=['red','blue']) # plot counts
 ```
 \normalsize
 ::: columns
 :::: {.column width=38%}
 ![](figures/pandas_crosstabplot.png)
 ::::
 :::
 ## Exercise 2
 Read the file [\textcolor{violet}{heart.csv}](https://www.physi.uni-heidelberg.de/~reygers/lectures/2023/ml/exercises/heart.csv) into a DataFrame.
 [\textcolor{violet}{Information on the dataset}](https://archive.ics.uci.edu/ml/datasets/heart+Disease)
 \setbeamertemplate{itemize item}{\color{red}$\square$}
  * Which columns do we have
  * Print the first 3 rows
  * Print the statistics summary and the correlations
  * Print mean values for each column with and without disease (target)
  * Select the data according to `sex` and `target` (heart disease 0=no 1=yes). 
  * Plot the `age` distribution of male and female in one histogram
  * Plot the heart disease distribution according to chest pain type `cp`
  * Plot `thalach`  according to `target` in one histogram
  * Plot `sex` and `target` in a histogram figure    
  * Correlate `age` and `max heart rate` according to `target` 
  * Correlate `age` and `cholesterol` according to `target` 
  \small
   [Solution: 01_intro_ex_2_sol.ipynb](https://www.physi.uni-heidelberg.de/~reygers/lectures/2023/ml/solutions/01_intro_ex_2_sol.ipynb) \normalsize
Author	SHA1	Message	Date
Joerg Marks	7bbcc601b7	update files	2023-04-03 13:08:49 +02:00
Joerg Marks	36f3b3ede8	update	2023-04-03 13:04:54 +02:00