diff --git a/slides/.gitignore b/slides/.gitignore new file mode 100644 index 0000000..e43b0f9 --- /dev/null +++ b/slides/.gitignore @@ -0,0 +1 @@ +.DS_Store diff --git a/slides/Makefile b/slides/Makefile new file mode 100644 index 0000000..e3db178 --- /dev/null +++ b/slides/Makefile @@ -0,0 +1,10 @@ +# make creates pdf files of all newly edited .md files + +SRCS := $(wildcard *.md) +PDF := $(SRCS:%.md=%.pdf) + +OPT := --pdf-engine=xelatex --variable mainfont="Helvetica" --variable sansfont="Helvetica" -t beamer -s -fmarkdown-implicit_figures --template=template.beamer --highlight-style=kate +all: ${PDF} + +%.pdf: %.md + pandoc $(OPT) --output=$@ $< diff --git a/slides/README.md b/slides/README.md new file mode 100644 index 0000000..c22dd99 --- /dev/null +++ b/slides/README.md @@ -0,0 +1,2 @@ +Pandoc slides example following the style of [Stefan Wunsch's CERN IML workshop presentation](https://github.com/stwunsch/iml_keras_workshop) on [keras](https://keras.io/) (see slides folder) + diff --git a/slides/copy_slides.sh b/slides/copy_slides.sh new file mode 100755 index 0000000..89a426e --- /dev/null +++ b/slides/copy_slides.sh @@ -0,0 +1,6 @@ +# slides (do chgrp machlearn later) +# scp CIPpoolAccess.PDF reygers@rho0:public_html/lectures/2021/ml/transparencies/ +# scp 03_ml_basics.pdf reygers@rho0:public_html/lectures/2021/ml/transparencies/ +# scp 04_decision_trees.pdf reygers@rho0:public_html/lectures/2021/ml/transparencies/ +scp 05_neural_networks.pdf reygers@rho0:public_html/lectures/2021/ml/transparencies/ + diff --git a/slides/decision_trees.md b/slides/decision_trees.md new file mode 100644 index 0000000..06817cd --- /dev/null +++ b/slides/decision_trees.md @@ -0,0 +1,347 @@ +--- +title: | + | Introduction to Data Analysis and Machine Learning in Physics: + | 4. Decision Trees + +author: "Martino Borsato, Jörg Marks, Klaus Reygers" +date: "Studierendentage, 11-14 April 2022" +--- +## Exercises + +* Exercise 1: Compare different decision tree classifiers + * [`04_decision_trees_ex_1_compare_tree_classifiers.ipynb`](https://nbviewer.jupyter.org/urls/www.physi.uni-heidelberg.de/~reygers/lectures/2022/ml/exercises/04_decision_trees_ex_1_compare_tree_classifiers.ipynb) +* Exercise 2: Apply XGBoost classifier to MAGIC data set + * [`04_decision_trees_ex_2_magic_xgboost_and_random_forest.ipynb`](https://nbviewer.jupyter.org/urls/www.physi.uni-heidelberg.de/~reygers/lectures/2022/ml/exercises/04_decision_trees_ex_2_magic_xgboost_and_random_forest.ipynb) +* Exercise 3: Feature importance +* Exercise 4: Interpret a classifier with SHAP values + +## Decision trees + +\begin{figure} +\centering +\includegraphics[width=0.85\textwidth]{figures/mini_boone_decisions_tree.png} +\end{figure} + +\begin{center} +Leaf nodes classify events as either signal or background +\end{center} + +## Decision trees: Rectangular volumes in feature space + +\begin{figure} +\centering +\includegraphics[width=0.75\textwidth]{figures/decision_trees_feature_space.png} +\end{figure} + +* Easy to interpret and visualize: Space of feature vectors split up into rectangular volumes (attributed to either signal or background) +* How to build a decision tree in an optimal way? + +## Finding optimal cuts + +Separation between
signal and background is often measured with the Gini index (or Gini impurity): + +$$ G = p (1-p) $$ + +Here $p$ is the purity: +$$ p = \frac{\sum_\mathrm{signal} w_i}{\sum_\mathrm{signal} w_i + \sum_\mathrm{background} w_i}, \quad w_i = \text{weight of event}\; i$$ + +\vfill +\textcolor{gray}{Usefulness of weights will become apparent soon.} + +\vfill +Improvement in signal/background separation after splitting a set A into two sets B and C: +$$ \Delta = W_A G_A - W_B G_B - W_C G_C \quad \text{where} \quad W_X = \sum_{X} w_i $$ + +## Gini impurity and other purity measures +\begin{figure} +\centering +\includegraphics[width=0.7\textwidth]{figures/signal_purity.png} +\end{figure} + + +## Decision tree pruning + +::: columns +:::: {.column width=50%} + +When to stop growing a tree? + +* When all nodes are essentially pure? +* Well, that's overfitting! + +\vspace{3ex} + +Pruning + +* Cut back fully grown tree to avoid overtraining, i.e., replace nodes and subtrees by leaves + +:::: +:::: {.column width=50%} +\begin{figure} +\centering +\includegraphics[width=0.85\textwidth]{figures/tree_pruning_slides.png} +\end{figure} +:::: +::: + +## Single decision trees: Pros and cons + +\textcolor{green}{Pros:} + +* Requires little data preparation (unlike neural networks) +* Can use continuous and categorical inputs + +\vfill + +\textcolor{red}{Cons:} + +* Danger of overfitting training data +* Sensitive to fluctuations in the training data +* Hard to find global optimum +* When to stop splitting? + +## Ensemble methods: Combine weak learners + +::: columns +:::: {.column width=70%} +* Bootstrap Aggregating (Bagging) + * Sample training data (with replacement) and train a separate model on each of the derived training sets + * Classify example with majority vote, or compute average output from each tree as model output + +:::: +:::: {.column width=30%} +$$ y(\vec x) = \frac{1}{N_\mathrm{trees}} \sum_{i=1}^{N_{trees}} y_i(\vec x) $$ +:::: +::: +\vfill +::: columns +:::: {.column width=70%} +* Boosting + * Train $N$ models in sequence, giving more weight to examples not correctly classified by previous model + * Take weighted average to classify examples + +:::: +:::: {.column width=30%} +$$ y(\vec x) = \frac{\sum_{i=1}^{N_\mathrm{trees}} \alpha_i y_i(\vec x)}{\sum_{i=1}^{N_\mathrm{trees}} \alpha_i} $$ +:::: +::: + +## Random forests + +* "One of the most widely used and versatile algorithms in data science and machine learning" +\tiny \textcolor{gray}{arXiv:1803.08823v3} \normalsize +\vfill +* Use bagging to select random example subset +\vfill +* Train a tree, but only use random subset of features at each split + * this reduces the correlation between different trees + * makes the decision more robust to missing data + +## Boosted decision trees: Idea + +\begin{figure} +\centering +\includegraphics[width=0.75\textwidth]{figures/bdt.png} +\end{figure} + +## AdaBoost (short for Adaptive Boosting) + +Initial training sample + +\begin{center} +\begin{tabular}{l l} +$\vec x_1, ..., \vec x_n$: & multivariate event data \\ +$y_1, ..., y_n$: & true class labels, $+1$ or $-1$ \\ +$w_1^{(1)}, ..., w_n^{(1)}$ & event weights +\end{tabular} +\end{center} + +with equal weights normalized as + +$$ \sum_{i=1}^n w_i^{(1)} = 1 $$ + +Train first classifier $f_1$: + +\begin{center} +\begin{tabular}{l l} +$f_1(\vec x_i) > 0$ & classify as signal \\ +$f_1(\vec x_i) < 0$ & classify as background +\end{tabular} +\end{center} + +## AdaBoost: Updating events weights + +Define training sample $k+1$ from training sample $k$ by 
updating weights: + +$$ w_i^{(k+1)} = w_i^{(k)} \frac{e^{- \alpha_k f_k(\vec x_i) y_i/2}}{Z_k} $$ + +\footnotesize +\textcolor{gray}{$$ i = \text{event index}, \quad Z_k:\; \text{normalization factor so that } \sum_{i=1}^n w_i^{(k)} = 1$$} +\normalsize + +Weight is increased if event was misclassified by the previous classifier + +$\to$ "Next classifier should pay more attention to misclassified events" + + +\vfill +At each step the classifier $f_k$ minimizes error rate: + +$$ \varepsilon_k = \sum_{i=1}^n w_i^{(k)} I(y_i f_k( \vec x_i) \le 0), +\quad I(X) = 1 \; \text{if} \; X \; \text{is true, 0 otherwise} $$ + +## AdaBoost: Assigning the classifier score + +Assign score to each classifier according to its error rate: +$$ \alpha_k = \ln \frac{1 - \varepsilon_k}{\varepsilon_k} $$ + +\vfill + +Combined classifier (weighted average): +$$ f(\vec x) = \sum_{k=1}^K \alpha_k f_k(\vec x) $$ + + + +## Gradient boosting + +Basic idea: + +* Train a first decision tree +* Then train a second one on the residual errors made by the first tree +* And so on + +\vfill + +In slightly more detail: + +* \color{gray} Consider labeled training data: $\{\vec x_i, y_i\}$ +* Model prediction at iteration $m$: $F_m(\vec x_i)$ +* New model: $F_{m+1}(\vec x) = F_m(\vec x) + h_m(\vec x)$ +* Find $h_m(\vec x)$ by fitting it to +$\{(\vec x_1, y_1 - F_m(\vec x_1)), \; (\vec x_2, y_2 - F_m(\vec x_2)), \; ... \; (\vec x_n, y_n - F_m(\vec x_n)) \}$ + +\color{black} + +## Example 1: Predict critical temperature for superconductivty (Regression with XGBoost) (1) +\small +[\textcolor{gray}{04\_decision\_trees\_critical\_temp\_regression.ipynb}](https://nbviewer.jupyter.org/urls/www.physi.uni-heidelberg.de/~reygers/lectures/2022/ml/examples/04_decision_trees_critical_temp_regression.ipynb) +\normalsize + +\vfill + +Superconductivty data set: + +Predict the critical temperature based on 81 material features. +\footnotesize +[\textcolor{gray}{https://archive.ics.uci.edu/ml/datasets/Superconductivty+Data}](https://archive.ics.uci.edu/ml/datasets/Superconductivty+Data) +\normalsize + +\vfill + +From the abstract: + + +We estimate a statistical model to predict the superconducting critical temperature based on the features extracted from the superconductor’s chemical formula. The statistical model gives reasonable out-of-sample predictions: ±9.5 K based on root-mean-squared-error. Features extracted based on thermal conductivity, atomic radius, valence, electron affinity, and atomic mass contribute the most to the model’s predictive accuracy. 
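
The regression code on the next slide assumes that the material features and the critical temperature have already been split into training and test sets. A minimal preparation sketch (the file name `train.csv` and the target column `critical_temp` are assumptions, not taken from the notebook):

\footnotesize
```python
import pandas as pd
from sklearn.model_selection import train_test_split

# assumption: the UCI data was downloaded as "train.csv" with target column "critical_temp"
df = pd.read_csv("train.csv")
X = df.drop(columns=["critical_temp"])   # 81 material features
y = df["critical_temp"]                  # critical temperature in K

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
```
\normalsize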
+ +\vfill + +\tiny +[\textcolor{gray}{https://doi.org/10.1016/j.commatsci.2018.07.052}](https://doi.org/10.1016/j.commatsci.2018.07.052) +\normalsize + + +## Example 1: Predict critical temperature for superconductivty (Regression with XGBoost) (2) + +::: columns +:::: {.column width=60%} +\footnotesize +```python +import xgboost as xgb + +XGBreg = xgb.sklearn.XGBRegressor() + +XGBreg.fit(X_train, y_train) + +y_pred = XGBreg.predict(X_test) + +from sklearn.metrics import mean_squared_error +rms = np.sqrt(mean_squared_error(y_test, y_pred)) +print(f"root mean square error {rms:.2f}") +``` + +\textcolor{gray}{This gives:} + +`root mean square error 9.68` +:::: +:::: {.column width=40%} +\vspace{6ex} +![](figures/critical_temperature.pdf) +:::: +::: + +## Exercise 1: Compare different decision tree classifiers + +\small +[\textcolor{gray}{04\_decision\_trees\_ex\_1\_compare\_tree\_classifiers.ipynb}](https://nbviewer.jupyter.org/urls/www.physi.uni-heidelberg.de/~reygers/lectures/2022/ml/exercises/04_decision_trees_ex_1_compare_tree_classifiers.ipynb) + +\vspace{5ex} + +Compare scikit-learns's `AdaBoostClassifier`, `RandomForestClassifier`, and `GradientBoostingClassifier` by plotting their ROC curves for the heart disease data set. \newline + +\vspace{2ex} + +Is there a classifier that clearly performs best? + + +## Exercise 2: Apply XGBoost classifier to MAGIC data set + +\small +[\textcolor{gray}{04\_decision\_trees\_ex\_2\_magic\_xgboost\_and\_random\_forest.ipynb}](https://nbviewer.jupyter.org/urls/www.physi.uni-heidelberg.de/~reygers/lectures/2022/ml/exercises/04_decision_trees_ex_2_magic_xgboost_and_random_forest.ipynb) +\normalsize + +\footnotesize +```python +# train XGBoost boosted decision tree +import xgboost as xgb +XGBclassifier = xgb.sklearn.XGBClassifier(nthread=-1, seed=1, n_estimators=1000) +``` +\normalsize + +\small +a) Plot predicted probabilities for the test sample for signal and background events (\texttt{plt.hist}) +b) Which is the most important feature for discriminating signal and background according to XGBoost? \ +Hint: use plot_impartance from XGBoost (see [XGBoost plotting API](https://xgboost.readthedocs.io/en/latest/python/python_api.html)). Do you get the same answer for all three performance measures provided by XGBoost (“weight”, “gain”, or “cover”)? +c) Visualize one decision tree from the ensemble (let's say tree number 10). For this you need the the graphviz package (`pip3 install graphviz`) +d) Compare the performance of XGBoost with the [**random forest classifier**](https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.RandomForestClassifier.html) from [**scikit learn**](https://scikit-learn.org/stable/index.html). Plot signal and background efficiency for both classifiers in one plot. Which classifier performs better? +\normalsize + + +## Exercise 3: Feature importance + +\small +[\textcolor{gray}{04\_decision\_trees\_ex\_3\_magic\_feature\_importance.ipynb}](https://nbviewer.jupyter.org/urls/www.physi.uni-heidelberg.de/~reygers/lectures/2022/ml/exercises/04_decision_trees_ex_3_magic_feature_importance.ipynb) +\normalsize + +\vspace{3ex} + +Evaluate the importance of each of the $n$ features in the training of the XGBoost classifier for the MAGIC data set by dropping one of the features. This gives $n$ different classifiers. Compare the performance of these classifiers using the AUC score. 
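
\vspace{2ex}

A possible starting point (a sketch only; it assumes the MAGIC features and labels are available as a pandas DataFrame `X` and a Series `y`, as in exercise 2):

\footnotesize
```python
import xgboost as xgb
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.5, random_state=1)

# drop one feature at a time, retrain, and compare the AUC scores
for feature in X_train.columns:
    clf = xgb.sklearn.XGBClassifier(nthread=-1, seed=1)
    clf.fit(X_train.drop(columns=[feature]), y_train)
    y_score = clf.predict_proba(X_test.drop(columns=[feature]))[:, 1]
    print(f"without {feature}: AUC = {roc_auc_score(y_test, y_score):.4f}")
```
\normalsize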
+ + +## Exercise 4: Interpret a classifier with SHAP values + +SHAP (SHapley Additive exPlanations) are a means to explain the output of any machine learning model. [Shapeley values](https://en.wikipedia.org/wiki/Shapley_value) are a concept that is used in cooperative game theory. They are named after Lloyd Shapley who won the Nobel Prize in Economics in 2012. + +\vfill + +Use the Python library [`SHAP`](https://shap.readthedocs.io/en/latest/index.html) to quantify the feature importance. + +a) Study the documentation at [https://shap.readthedocs.io/en/latest/tabular_examples.html](https://shap.readthedocs.io/en/latest/tabular_examples.html) + +b) Create a summary plot of the feature importance in the MAGIC data set with `shap.summary_plot` for the XGBoost classifier of exercise 2. What are the three most important features? + +c) Do the same for the superconductivity data set? What are the three most important features? + + + + + diff --git a/slides/figures/03_ml_basics_galton_linear_regression_iminuit.pdf b/slides/figures/03_ml_basics_galton_linear_regression_iminuit.pdf new file mode 100644 index 0000000..b555633 Binary files /dev/null and b/slides/figures/03_ml_basics_galton_linear_regression_iminuit.pdf differ diff --git a/slides/figures/03_ml_basics_log_regr_heart_disease.pdf b/slides/figures/03_ml_basics_log_regr_heart_disease.pdf new file mode 100644 index 0000000..a16e1d7 Binary files /dev/null and b/slides/figures/03_ml_basics_log_regr_heart_disease.pdf differ diff --git a/slides/figures/03_ml_basics_logistic_regression.pdf b/slides/figures/03_ml_basics_logistic_regression.pdf new file mode 100644 index 0000000..277c8fa Binary files /dev/null and b/slides/figures/03_ml_basics_logistic_regression.pdf differ diff --git a/slides/figures/L1vsL2.pdf b/slides/figures/L1vsL2.pdf new file mode 100644 index 0000000..dfdd9ae Binary files /dev/null and b/slides/figures/L1vsL2.pdf differ diff --git a/slides/figures/activation_functions.png b/slides/figures/activation_functions.png new file mode 100644 index 0000000..5dec32b Binary files /dev/null and b/slides/figures/activation_functions.png differ diff --git a/slides/figures/adversarial_attack.png b/slides/figures/adversarial_attack.png new file mode 100644 index 0000000..49bd32f Binary files /dev/null and b/slides/figures/adversarial_attack.png differ diff --git a/slides/figures/ai_history.png b/slides/figures/ai_history.png new file mode 100644 index 0000000..2febc77 Binary files /dev/null and b/slides/figures/ai_history.png differ diff --git a/slides/figures/ai_ml_dl.pdf b/slides/figures/ai_ml_dl.pdf new file mode 100644 index 0000000..03fd6d0 Binary files /dev/null and b/slides/figures/ai_ml_dl.pdf differ diff --git a/slides/figures/ann.png b/slides/figures/ann.png new file mode 100644 index 0000000..af3d34d Binary files /dev/null and b/slides/figures/ann.png differ diff --git a/slides/figures/anomaly_detection.png b/slides/figures/anomaly_detection.png new file mode 100644 index 0000000..d66fbb8 Binary files /dev/null and b/slides/figures/anomaly_detection.png differ diff --git a/slides/figures/autoencoder_example.pdf b/slides/figures/autoencoder_example.pdf new file mode 100644 index 0000000..f4b2407 Binary files /dev/null and b/slides/figures/autoencoder_example.pdf differ diff --git a/slides/figures/bdt.png b/slides/figures/bdt.png new file mode 100644 index 0000000..37e22e8 Binary files /dev/null and b/slides/figures/bdt.png differ diff --git a/slides/figures/book-murphy.png b/slides/figures/book-murphy.png new file mode 100644 
index 0000000..63f8f61 Binary files /dev/null and b/slides/figures/book-murphy.png differ diff --git a/slides/figures/book_deep_learning_for_physics_research.png b/slides/figures/book_deep_learning_for_physics_research.png new file mode 100644 index 0000000..c1e706e Binary files /dev/null and b/slides/figures/book_deep_learning_for_physics_research.png differ diff --git a/slides/figures/boston_house_prices.pdf b/slides/figures/boston_house_prices.pdf new file mode 100644 index 0000000..68ad798 Binary files /dev/null and b/slides/figures/boston_house_prices.pdf differ diff --git a/slides/figures/cnn.png b/slides/figures/cnn.png new file mode 100644 index 0000000..e717e2c Binary files /dev/null and b/slides/figures/cnn.png differ diff --git a/slides/figures/cnn_conv_layer.png b/slides/figures/cnn_conv_layer.png new file mode 100644 index 0000000..b50382f Binary files /dev/null and b/slides/figures/cnn_conv_layer.png differ diff --git a/slides/figures/cnn_fully_connected.png b/slides/figures/cnn_fully_connected.png new file mode 100644 index 0000000..ec306f2 Binary files /dev/null and b/slides/figures/cnn_fully_connected.png differ diff --git a/slides/figures/cnn_pooling.png b/slides/figures/cnn_pooling.png new file mode 100644 index 0000000..7aa1ae4 Binary files /dev/null and b/slides/figures/cnn_pooling.png differ diff --git a/slides/figures/cnn_sliding_filter.png b/slides/figures/cnn_sliding_filter.png new file mode 100644 index 0000000..72855b9 Binary files /dev/null and b/slides/figures/cnn_sliding_filter.png differ diff --git a/slides/figures/critical_temperature.pdf b/slides/figures/critical_temperature.pdf new file mode 100644 index 0000000..b4c4a3b Binary files /dev/null and b/slides/figures/critical_temperature.pdf differ diff --git a/slides/figures/cross_val.png b/slides/figures/cross_val.png new file mode 100644 index 0000000..d1c81b9 Binary files /dev/null and b/slides/figures/cross_val.png differ diff --git a/slides/figures/decision_boundaries.png b/slides/figures/decision_boundaries.png new file mode 100644 index 0000000..25f2501 Binary files /dev/null and b/slides/figures/decision_boundaries.png differ diff --git a/slides/figures/decision_trees_feature_space.png b/slides/figures/decision_trees_feature_space.png new file mode 100644 index 0000000..4002331 Binary files /dev/null and b/slides/figures/decision_trees_feature_space.png differ diff --git a/slides/figures/deep_learning_book.png b/slides/figures/deep_learning_book.png new file mode 100644 index 0000000..cc9dcd8 Binary files /dev/null and b/slides/figures/deep_learning_book.png differ diff --git a/slides/figures/deep_learning_with_python.png b/slides/figures/deep_learning_with_python.png new file mode 100644 index 0000000..dc8aa2c Binary files /dev/null and b/slides/figures/deep_learning_with_python.png differ diff --git a/slides/figures/deepl.png b/slides/figures/deepl.png new file mode 100644 index 0000000..bf9e88f Binary files /dev/null and b/slides/figures/deepl.png differ diff --git a/slides/figures/dnn.png b/slides/figures/dnn.png new file mode 100644 index 0000000..67abadc Binary files /dev/null and b/slides/figures/dnn.png differ diff --git a/slides/figures/dropout.png b/slides/figures/dropout.png new file mode 100644 index 0000000..fca7610 Binary files /dev/null and b/slides/figures/dropout.png differ diff --git a/slides/figures/example_overtraining.png b/slides/figures/example_overtraining.png new file mode 100644 index 0000000..baf0a91 Binary files /dev/null and b/slides/figures/example_overtraining.png 
differ diff --git a/slides/figures/feature_transformation.png b/slides/figures/feature_transformation.png new file mode 100644 index 0000000..edbb7a5 Binary files /dev/null and b/slides/figures/feature_transformation.png differ diff --git a/slides/figures/fisher.png b/slides/figures/fisher.png new file mode 100644 index 0000000..e5a41e5 Binary files /dev/null and b/slides/figures/fisher.png differ diff --git a/slides/figures/fisher_linear_decision_boundary.png b/slides/figures/fisher_linear_decision_boundary.png new file mode 100644 index 0000000..0a527de Binary files /dev/null and b/slides/figures/fisher_linear_decision_boundary.png differ diff --git a/slides/figures/gan.png b/slides/figures/gan.png new file mode 100644 index 0000000..643c410 Binary files /dev/null and b/slides/figures/gan.png differ diff --git a/slides/figures/gradient_descent.png b/slides/figures/gradient_descent.png new file mode 100644 index 0000000..93eaad4 Binary files /dev/null and b/slides/figures/gradient_descent.png differ diff --git a/slides/figures/gradient_descent_cmp.png b/slides/figures/gradient_descent_cmp.png new file mode 100644 index 0000000..4ca0271 Binary files /dev/null and b/slides/figures/gradient_descent_cmp.png differ diff --git a/slides/figures/hands_on_machine_learning.png b/slides/figures/hands_on_machine_learning.png new file mode 100644 index 0000000..db9af4d Binary files /dev/null and b/slides/figures/hands_on_machine_learning.png differ diff --git a/slides/figures/handwritten_digits.png b/slides/figures/handwritten_digits.png new file mode 100644 index 0000000..bbcc538 Binary files /dev/null and b/slides/figures/handwritten_digits.png differ diff --git a/slides/figures/heart_table.png b/slides/figures/heart_table.png new file mode 100644 index 0000000..bdc59ce Binary files /dev/null and b/slides/figures/heart_table.png differ diff --git a/slides/figures/imagenet.png b/slides/figures/imagenet.png new file mode 100644 index 0000000..697061e Binary files /dev/null and b/slides/figures/imagenet.png differ diff --git a/slides/figures/imagenet_challenge.png b/slides/figures/imagenet_challenge.png new file mode 100644 index 0000000..8b90c92 Binary files /dev/null and b/slides/figures/imagenet_challenge.png differ diff --git a/slides/figures/iminuit_minos_scan-1.png b/slides/figures/iminuit_minos_scan-1.png new file mode 100644 index 0000000..58f5a85 Binary files /dev/null and b/slides/figures/iminuit_minos_scan-1.png differ diff --git a/slides/figures/iminuit_minos_scan-2.png b/slides/figures/iminuit_minos_scan-2.png new file mode 100644 index 0000000..0584938 Binary files /dev/null and b/slides/figures/iminuit_minos_scan-2.png differ diff --git a/slides/figures/iris_dataset.png b/slides/figures/iris_dataset.png new file mode 100644 index 0000000..cf79956 Binary files /dev/null and b/slides/figures/iris_dataset.png differ diff --git a/slides/figures/keras.png b/slides/figures/keras.png new file mode 100644 index 0000000..723ca74 Binary files /dev/null and b/slides/figures/keras.png differ diff --git a/slides/figures/knn.png b/slides/figures/knn.png new file mode 100644 index 0000000..fcbad16 Binary files /dev/null and b/slides/figures/knn.png differ diff --git a/slides/figures/logistic_fct.png b/slides/figures/logistic_fct.png new file mode 100644 index 0000000..6c0bb80 Binary files /dev/null and b/slides/figures/logistic_fct.png differ diff --git a/slides/figures/loss_fct.png b/slides/figures/loss_fct.png new file mode 100644 index 0000000..5f6b621 Binary files /dev/null and 
b/slides/figures/loss_fct.png differ diff --git a/slides/figures/magic_photo.png b/slides/figures/magic_photo.png new file mode 100644 index 0000000..8ef520d Binary files /dev/null and b/slides/figures/magic_photo.png differ diff --git a/slides/figures/magic_photo_small.png b/slides/figures/magic_photo_small.png new file mode 100644 index 0000000..fd84ce3 Binary files /dev/null and b/slides/figures/magic_photo_small.png differ diff --git a/slides/figures/magic_shower_em_had.png b/slides/figures/magic_shower_em_had.png new file mode 100644 index 0000000..92edd27 Binary files /dev/null and b/slides/figures/magic_shower_em_had.png differ diff --git a/slides/figures/magic_shower_em_had_small.png b/slides/figures/magic_shower_em_had_small.png new file mode 100644 index 0000000..ee51248 Binary files /dev/null and b/slides/figures/magic_shower_em_had_small.png differ diff --git a/slides/figures/magic_shower_parameters.png b/slides/figures/magic_shower_parameters.png new file mode 100644 index 0000000..4a9871e Binary files /dev/null and b/slides/figures/magic_shower_parameters.png differ diff --git a/slides/figures/magic_sketch.png b/slides/figures/magic_sketch.png new file mode 100644 index 0000000..9f66e62 Binary files /dev/null and b/slides/figures/magic_sketch.png differ diff --git a/slides/figures/matplotlib_Figure_1.png b/slides/figures/matplotlib_Figure_1.png new file mode 100644 index 0000000..b4129f1 Binary files /dev/null and b/slides/figures/matplotlib_Figure_1.png differ diff --git a/slides/figures/matplotlib_Figure_2.png b/slides/figures/matplotlib_Figure_2.png new file mode 100644 index 0000000..e013010 Binary files /dev/null and b/slides/figures/matplotlib_Figure_2.png differ diff --git a/slides/figures/matplotlib_Figure_3.png b/slides/figures/matplotlib_Figure_3.png new file mode 100644 index 0000000..b819274 Binary files /dev/null and b/slides/figures/matplotlib_Figure_3.png differ diff --git a/slides/figures/matplotlib_Figure_4.png b/slides/figures/matplotlib_Figure_4.png new file mode 100644 index 0000000..7e70aff Binary files /dev/null and b/slides/figures/matplotlib_Figure_4.png differ diff --git a/slides/figures/mini_boone_decisions_tree.png b/slides/figures/mini_boone_decisions_tree.png new file mode 100644 index 0000000..55f0961 Binary files /dev/null and b/slides/figures/mini_boone_decisions_tree.png differ diff --git a/slides/figures/ml_example_spam.png b/slides/figures/ml_example_spam.png new file mode 100644 index 0000000..f3ad26e Binary files /dev/null and b/slides/figures/ml_example_spam.png differ diff --git a/slides/figures/mlp.png b/slides/figures/mlp.png new file mode 100644 index 0000000..fc791a2 Binary files /dev/null and b/slides/figures/mlp.png differ diff --git a/slides/figures/mnist.png b/slides/figures/mnist.png new file mode 100644 index 0000000..14a7161 Binary files /dev/null and b/slides/figures/mnist.png differ diff --git a/slides/figures/monitoring_overtraining.png b/slides/figures/monitoring_overtraining.png new file mode 100644 index 0000000..bdc9a0b Binary files /dev/null and b/slides/figures/monitoring_overtraining.png differ diff --git a/slides/figures/mva.png b/slides/figures/mva.png new file mode 100644 index 0000000..578d268 Binary files /dev/null and b/slides/figures/mva.png differ diff --git a/slides/figures/mva_nn.png b/slides/figures/mva_nn.png new file mode 100644 index 0000000..8c7077d Binary files /dev/null and b/slides/figures/mva_nn.png differ diff --git a/slides/figures/neuron.png b/slides/figures/neuron.png new file mode 100644 index 
0000000..d8dea7b Binary files /dev/null and b/slides/figures/neuron.png differ diff --git a/slides/figures/nn_decision_boundary.png b/slides/figures/nn_decision_boundary.png new file mode 100644 index 0000000..4e0745d Binary files /dev/null and b/slides/figures/nn_decision_boundary.png differ diff --git a/slides/figures/pandas_crosstabplot.png b/slides/figures/pandas_crosstabplot.png new file mode 100644 index 0000000..fea9408 Binary files /dev/null and b/slides/figures/pandas_crosstabplot.png differ diff --git a/slides/figures/pandas_histogramm.png b/slides/figures/pandas_histogramm.png new file mode 100644 index 0000000..ecec461 Binary files /dev/null and b/slides/figures/pandas_histogramm.png differ diff --git a/slides/figures/pandas_scatterplot.png b/slides/figures/pandas_scatterplot.png new file mode 100644 index 0000000..d546f8a Binary files /dev/null and b/slides/figures/pandas_scatterplot.png differ diff --git a/slides/figures/pdf_from_2d_histogram.png b/slides/figures/pdf_from_2d_histogram.png new file mode 100644 index 0000000..736e93b Binary files /dev/null and b/slides/figures/pdf_from_2d_histogram.png differ diff --git a/slides/figures/perceptron_photo.png b/slides/figures/perceptron_photo.png new file mode 100644 index 0000000..badacb5 Binary files /dev/null and b/slides/figures/perceptron_photo.png differ diff --git a/slides/figures/perceptron_retina.png b/slides/figures/perceptron_retina.png new file mode 100644 index 0000000..4b512ce Binary files /dev/null and b/slides/figures/perceptron_retina.png differ diff --git a/slides/figures/perceptron_weighted_sum.png b/slides/figures/perceptron_weighted_sum.png new file mode 100644 index 0000000..2f6a49f Binary files /dev/null and b/slides/figures/perceptron_weighted_sum.png differ diff --git a/slides/figures/perceptron_with_threshold.png b/slides/figures/perceptron_with_threshold.png new file mode 100644 index 0000000..0ee3a97 Binary files /dev/null and b/slides/figures/perceptron_with_threshold.png differ diff --git a/slides/figures/regularization.png b/slides/figures/regularization.png new file mode 100644 index 0000000..90e0c3b Binary files /dev/null and b/slides/figures/regularization.png differ diff --git a/slides/figures/relu.png b/slides/figures/relu.png new file mode 100644 index 0000000..317d730 Binary files /dev/null and b/slides/figures/relu.png differ diff --git a/slides/figures/rootOptions.png b/slides/figures/rootOptions.png new file mode 100644 index 0000000..21f984a Binary files /dev/null and b/slides/figures/rootOptions.png differ diff --git a/slides/figures/scikit-learn.png b/slides/figures/scikit-learn.png new file mode 100644 index 0000000..ca0a9a5 Binary files /dev/null and b/slides/figures/scikit-learn.png differ diff --git a/slides/figures/sigmoid.png b/slides/figures/sigmoid.png new file mode 100644 index 0000000..32dcfdb Binary files /dev/null and b/slides/figures/sigmoid.png differ diff --git a/slides/figures/signal_background_distr.png b/slides/figures/signal_background_distr.png new file mode 100644 index 0000000..f73007a Binary files /dev/null and b/slides/figures/signal_background_distr.png differ diff --git a/slides/figures/signal_purity.png b/slides/figures/signal_purity.png new file mode 100644 index 0000000..0cffdc9 Binary files /dev/null and b/slides/figures/signal_purity.png differ diff --git a/slides/figures/stochastic_gradient_descent.png b/slides/figures/stochastic_gradient_descent.png new file mode 100644 index 0000000..48a07a9 Binary files /dev/null and 
b/slides/figures/stochastic_gradient_descent.png differ diff --git a/slides/figures/supervised_learning_car_plane.png b/slides/figures/supervised_learning_car_plane.png new file mode 100644 index 0000000..e9a1842 Binary files /dev/null and b/slides/figures/supervised_learning_car_plane.png differ diff --git a/slides/figures/supervised_nutshell.png b/slides/figures/supervised_nutshell.png new file mode 100644 index 0000000..7ece776 Binary files /dev/null and b/slides/figures/supervised_nutshell.png differ diff --git a/slides/figures/tensorflow.png b/slides/figures/tensorflow.png new file mode 100644 index 0000000..85fdf6f Binary files /dev/null and b/slides/figures/tensorflow.png differ diff --git a/slides/figures/tf_playground.png b/slides/figures/tf_playground.png new file mode 100644 index 0000000..725efd7 Binary files /dev/null and b/slides/figures/tf_playground.png differ diff --git a/slides/figures/tree_pruning_slides.png b/slides/figures/tree_pruning_slides.png new file mode 100644 index 0000000..671a64e Binary files /dev/null and b/slides/figures/tree_pruning_slides.png differ diff --git a/slides/figures/underfitting_overfitting.pdf b/slides/figures/underfitting_overfitting.pdf new file mode 100644 index 0000000..89e4197 Binary files /dev/null and b/slides/figures/underfitting_overfitting.pdf differ diff --git a/slides/figures/underfitting_overfitting_001.png b/slides/figures/underfitting_overfitting_001.png new file mode 100644 index 0000000..e3a5221 Binary files /dev/null and b/slides/figures/underfitting_overfitting_001.png differ diff --git a/slides/figures/videogame.png b/slides/figures/videogame.png new file mode 100644 index 0000000..ee2a654 Binary files /dev/null and b/slides/figures/videogame.png differ diff --git a/slides/figures/xor.png b/slides/figures/xor.png new file mode 100644 index 0000000..9d872c2 Binary files /dev/null and b/slides/figures/xor.png differ diff --git a/slides/figures/xor_like_data.pdf b/slides/figures/xor_like_data.pdf new file mode 100644 index 0000000..c312cd7 Binary files /dev/null and b/slides/figures/xor_like_data.pdf differ diff --git a/slides/fit_intro.md b/slides/fit_intro.md new file mode 100644 index 0000000..6904f88 --- /dev/null +++ b/slides/fit_intro.md @@ -0,0 +1,563 @@ +--- +title: | + | Introduction to Data Analysis and Machine Learning in Physics: + | 2. Data modeling and fitting + +author: "Martino Borsato, Jörg Marks, Klaus Reygers" +date: "Studierendentage, 11-14 April 2022" +--- + +## Data modeling and fitting - introduction + +Data analysis is a process of understanding and modeling measured +data. The goal is to find patterns and to obtain inferences allowing to +observe underlying patterns. + + * There are 2 approaches to statistical data modeling + * Hypothesis testing: is our data compatible with a certain model? + * Determination of model parameter: use the data to determine the parameters + of a (theoretical) model + + * For the determination of model parameter + * Analysis of data distributions $\rightarrow$ mean, variance, + median, FWHM, .... \newline + allows for an approximate determination of model parameter + + * Data fitting with the least square method $\rightarrow$ an iterative + process which minimizes the deviation of a model decribed by parameters + from data. This determines the optimal values and uncertainties + of the parameters. + + * Maximum likelihood fitting $\rightarrow$ find a set of model parameters + which most likely describe the data by maximizing the probability + distributions. 
+ +The parameter determination by minimization is an integral part of machine +learning approaches, here a system learns patterns and predicts +related ones. This is the focus in the upcoming days. + +## Data modeling and fitting - introduction + +Data analysis is a process of understanding and modeling measured +data. The goal is to find patterns and to obtain inferences allowing to +observe underlying patterns. + + * There are 2 approaches to statistical data modeling + * Hypothesis testing: is our data compatible with a certain model? + * Determination of model parameter: use the data to determine the parameters + of a (theoretical) model + + * For the determination of model parameter + * Analysis of data distributions $\rightarrow$ mean, variance, + median, FWHM, .... \newline + allows for an approximate determination of model parameter + + \setbeamertemplate{itemize subitem}{\color{red}\tiny$\blacksquare$} + * \textcolor{blue}{Data fitting with the least square method + $\rightarrow$ an iterative + process which minimizes the deviation of a model decribed by parameters + from data. This determines the optimal values and uncertainties + of the parameters.} + + \setbeamertemplate{itemize subitem}{\color{blue}\tiny$\blacktriangleright$} + * Maximum likelihood fitting $\rightarrow$ find a set of model parameters + which most likely describe the data by maximizing the probability + distributions. + +The parameter determination by minimization is an integral part of machine +learning approaches, here a system learns patterns and predicts +related ones. This is the focus in the upcoming days. + + + +## Least Square (LS) Method (1) + +The method determines the \textcolor{blue}{optimal parameters of functions + to gaussian distributed measurements}. + +Lets consider a sample of $n$ measurements $y_{i}$ and a parametrized +description of the measurement $\eta_{i} = f(x_{i} | \theta)$ +with a parameter set $\theta = \theta_{1}, \theta_{2} ,.... \theta_{k}$, +dependent values $x_{i}$ and measurement errors $\sigma_{i}$. + +The parameter set should be determined such that +\begin{equation*} + \color{blue}{S = \sum \limits_{i=1}^{n} \frac{(y_i-\eta_i)^2}{\sigma_i^2} = \sum \limits_{i=1}^{n} \frac{(y_i- f(x_i|\theta))^2}{\sigma_i^2} \longrightarrow \, minimal } +\end{equation*} +In case of correlated measurements the covariance matrix of the $y_{i}$ has to +be taken into account. This is accomplished by defining a weight matrix from +the covariance matrix of the input data. A decorrelation of the input data +should be considered. +\vspace{0.2cm} + +$S$ follows a $\chi^{2}$-distribution with $(n-k)$ degrees of freedom. + +## Least Square (LS) Method (2) + +\setbeamertemplate{itemize item}{\color{red}\tiny$\blacksquare$} +* Example LS-method + \vspace{0.2cm} + + Often the fit function $f(x, \theta)$ is linear in + $\theta = \theta_{1}, \theta_{2} ,.... \theta_{k}$ + \vspace{0.2cm} + + $f(x | \theta) = \theta_{1} f_{1}(x) + .... 
+ \theta_{k} f_{k}(x)$ + \vspace{0.2cm} + + If the model is a straight line and our parameters are $\theta_{1}$ and + $\theta_{2}$ $(f_{1}(x) = 1,$ $f_{2}(x) = x)$ we have + $f(x | \theta) = \theta_{1} + \theta_{2} x$ + \vspace{0.2cm} + + The LS equation is + \vspace{0.2cm} + + $\color{blue}{S = \sum \limits_{i=1}^{n} \frac{(y_i-\eta_i)^2}{\sigma_i^2} } \color{black} {= \sum + \limits_{i=1}^{n} \frac{(y_{i} - \theta_{1} - x_{i} + \theta_{2})^2}{\sigma_i^2 }}$ \hspace{0.4cm} and with + \vspace{0.2cm} + + $\frac{\partial S}{\partial \theta_1} = \sum\limits_{i=1}^{n} \frac{-2 + (y_i - \theta_1 - x_i \theta_2)}{\sigma_i^2} = 0$ \hspace{0.4cm} and \hspace{0.4cm} + $\frac{\partial S}{\partial \theta_2} = \sum\limits_{i=1}^{n} \frac{-2 x_i (y_i - \theta_1 - x_i \theta_2)}{\sigma_i^2} = 0$ + \vspace{0.2cm} + + the parameters $\theta_{1}$ and $\theta_{2}$ can be determined. + + \vspace{0.2cm} + \textcolor{olive}{In case of linear fit functions solutions can be found by matrix inversion} + + \vfill + +## Least Square (LS) Method (3) + + \setbeamertemplate{itemize item}{\color{red}\tiny$\blacksquare$} + +* Use of a nonlinear fit function $f(x, \theta)$ like \hspace{0.4cm} + $f(x | \theta) = \theta_{1} \cdot e^{-\theta_{2} x}$ + \vspace{0.2cm} + + results in the LS equation + \vspace{0.2cm} + + $\color{blue}{S = \sum \limits_{i=1}^{n} \frac{(y_i-\eta_i)^2}{\sigma_i^2} } \color{black} {= \sum \limits_{i=1}^{n} \frac{(y_{i} - \theta_{1} \cdot e^{-\theta_{2} x_{i}})^2}{\sigma_i^2 }}$ \hspace{0.4cm} + \vspace{0.2cm} + + which we have to minimize + \vspace{0.2cm} + + $\frac{\partial S}{\partial \theta_1} = \sum\limits_{i=1}^{n} \frac{ 2 e^{-2 \theta_2 x_i} ( \theta_1 - y_i e^{\theta_2 x_i} )} {\sigma_i^2 } = 0$ \hspace{0.4cm} and \hspace{0.4cm} + $\frac{\partial S}{\partial \theta_2} = \sum\limits_{i=1}^{n} \frac{ 2 \theta_1 x_I e^{-2 \theta_2 x_i} (y_i e^{\theta_2 x_i} - \theta_1)} {\sigma_i^2 } = 0$ + + \vspace{0.4cm} + + In a nonlinear system, the LS Ansatz leads to derivatives which are + functions of the independent variable and the parameters $\color{red}\rightarrow$ \textcolor{olive}{no closed solutions} + \vspace{0.4cm} + + In general, we have gradient equations which don't have closed solutions. + There are a couple of methods including approximations which allow together + with numerical methods to find a global minimum, Gauss–Newton algorithm, + Levenberg–Marquardt algorithm, gradient descend methods and also direct + search methods. + +## Minuit - a programm package for minimization (1) + +In general data fitting and also solving machine learning algorithms lead +to a minimization problem of functions. In the +1975-1980 F. James (CERN) developed +a FORTRAN-based package, [\textcolor{violet}{MINUIT}](http://seal.web.cern.ch/seal/documents/minuit/mntutorial.pdf), which is a framework to handle +multiparameter minimization and compute the best-fit parameter values and +uncertainties, including correlations between the parameters. +\vspace{0.2cm} + +The user provides a minimization function +$F(X,P)$ with the parameter space $P=(p_1,....p_k)$ and +variable space $X$ (also multi-dimensional). There is an interface via +functions which influences the +the minimization process. MINUIT provides +[\textcolor{violet}{error calculations}](http://seal.web.cern.ch/seal/documents/minuit/mnerror.pdf) including correlations for the parameter space by evaluating the shape of the function in some neighbourhood of the minimum. 
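
\vspace{0.2cm}

A toy illustration (made-up numbers, independent of MINUIT) of how the shape of the function near the minimum yields a parameter uncertainty: for a least-squares function, $\Delta S = 1$ defines the $1\sigma$ interval, i.e. $\sigma^2 = 2/S''(\hat\theta)$.

\footnotesize
```python
import numpy as np
x, y, dy = np.array([1., 2., 3.]), np.array([2.1, 3.9, 6.2]), 0.2
def S(theta):                       # chi^2 for the toy model y = theta * x
    return np.sum((y - theta * x)**2 / dy**2)
theta_min = np.sum(x * y) / np.sum(x**2)       # analytic minimum of the linear problem
h = 1e-4                                       # finite-difference step
d2S = (S(theta_min + h) - 2*S(theta_min) + S(theta_min - h)) / h**2
sigma = np.sqrt(2. / d2S)           # 1 sigma uncertainty from the curvature at the minimum
print(theta_min, sigma)
```
\normalsize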
+\vspace{0.2cm} + +The package +has now a new object-oriented implementation as [\textcolor{violet}{Minuit2 library}](https://root.cern.ch/doc/master/Minuit2Page.html) , written +in C++. +\vspace{0.2cm} + +During the minimization $F(X,P)$ is evaluated for various $X$. For the +choice of $P=(p_1,....p_k)$ different methods are used + +## Minuit - a programm package for minimization (2) + +\vspace{0.4cm} +\textcolor{olive}{SEEK}: Search for the minimum with Monte Carlo methods, mostly used at the start + of the minimization with unknown starting values. It is not a converging + algorithm. + \vspace{0.2cm} + +\textcolor{olive}{SIMPLX}: + Uses the simplex method of Nelder and Mead. Function values are compared + in the parameter space. Via step size control the minimum is approached. + Parameter errors are only approximate, no covariance matrix is calculated. +\vspace{0.2cm} + + + +\textcolor{olive}{MIGRAD}: + Uses an algorithm of R. Fletcher, which takes the function and the gradient + to approach the minimum with a variable metric method. An error matrix and + correlation coefficients are available + \vspace{0.2cm} + +\textcolor{olive}{HESSE}: + Calculates the hessian matrix of second derivatives and determines the + covariance matrix. + \vspace{0.2cm} + +\textcolor{olive}{MINOS}: + Calculates (asymmetric) errors using likelihood profiles. + The algorithm for finding the positive and negative MINOS errors for parameter + $n$ consists of varying $n$ each time minimizing $F(X,P)$ with respect to + all the others. + \vspace{0.2cm} + +## Minuit - a programm package for minimization (3) + +\vspace{0.4cm} + +Fit process with the minuit package +\vspace{0.2cm} + +\setbeamertemplate{itemize item}{\color{red}\tiny$\blacksquare$} + +* The individual steps decribed above can be called several times and in different order during the minimization process. + +* Each of the parameters $p_i$ of $P=(p_1,....p_k)$ can be set constant and + released during the minimization steps. + +* Problems are expected in models with strong correlation between + parameters $\rightarrow$ change model to uncorrelated definitions + +* Local minima, edges/steps or undefined ranges in $F(X,P)$ are problematic + $\rightarrow$ simplify your model + + \vspace{3cm} + + +## Minuit2 - The iminuit package + +\vspace{0.4cm} + + [\textcolor{violet}{iminuit}](https://iminuit.readthedocs.io/en/stable/) is + a Jupyter-friendly Python interface for the Minuit2 C++ library. +\vspace{0.2cm} + + \setbeamertemplate{itemize item}{\color{red}\tiny$\blacksquare$} + +* The class `iminuit.Minuit` instanciates the minuit object. The minimizer + function is given as argument. Basic steering of the fit + like setting start parameters, error definition and print level is also + done here. + +\footnotesize +```python + from iminuit import Minuit + def fcn(x, y, z): # definition of the minimizer function + return (x - 2) ** 2 + (y - x) ** 2 + (z - 4) ** 2 + m = Minuit(fcn, x=0, y=0, z=0, errordef=1 , print_level=1) +``` +\normalsize + + * Several methods determine the interaction with the fitting process, calls + to `migrad` , `hesse` or printing of parameters and errors + +\footnotesize +```python + ...... 
+ m.migrad() # run optimiser + print(m.values , m.errors) # print results + m.hesse() # run covariance estimator +``` +\normalsize + +## Minuit2 - iminuit example + +\vspace{0.2cm} + +\setbeamertemplate{itemize item}{\color{red}\tiny$\blacksquare$} + + * The function `fcn` describes the model with parameters to be determined by + data.`fcn` is minimal when the model parameters agree best with data. + `fcn` has positional arguments, one for each fit parameter. `iminuit` + example fit: + + [\textcolor{violet}{02\_fit\_exp\_fit\_iMinuit.py}](https://www.physi.uni-heidelberg.de/~reygers/lectures/2021/ml/examples/02_fit_exp_fit_iMinuit.py) + +\footnotesize +```python + ...... + x = np.array([....],dtype='d') # measurements x + y = np.array([....],dtype='d') # measurements y + dy = np.array([....],dtype='d') # error in y + def xp(a, b , c): + return a * np.exp(b*x) + c + # least-squares function = sum of data residuals squared + def fcn(a,b,c): + return np.sum((y - xp(a,b,c)) ** 2 / dy ** 2) + # limit the range of b and fix parameter c + m = Minuit(fcn,a=1,b=-0.7,c=1,limit_b=(-1,0.1),fix_c=True) + m.migrad() # run minimizer + m.fixed["c"] = False # release parameter c + m.migrad() # rerun minimizer +``` +\normalsize + + * Might be useful to fix parameters or limit the range for some applications + +## Minuit2 - iminuit (3) + +\vspace{0.2cm} + +\setbeamertemplate{itemize item}{\color{red}\tiny$\blacksquare$} + +* Results and control information of the fit can be printed and accessed + in the the prorgamm. + +\footnotesize +```python + ...... + m = Minuit(fcn,....,print_level=1) # set flag in the initializer + m.migrad() # run minimizer + a_fit = m.values['a'] # get parameter value a + a_fit_error = m.errors['a'] # get parameter error of a + print (m.values,m.errors) # print results + ``` +\normalsize + +* After processing Hesse, covariance and correlation information of the + fit is available + +\footnotesize +```python + ...... + m.hesse() # run covariance estimator + m.matrix() # get covariance matrix + m.matrix(correlation=True) # get full correlation matrix + cov = m.np_matrix() # save matrix to numpy + cor = m.np_matrix(correlation=True) + print(cor[0, 1]) # print correlation between parameter 1 and 2 + ``` +\normalsize + +## Minuit2 - iminuit (4) + +\setbeamertemplate{itemize item}{\color{red}\tiny$\blacksquare$} + + * Minos provides asymmetric uncertainty intervals and parameter contours by + scanning one parameter and minimizing the function with respect to all other + parameters for each scan point. Results are displayed with `matplotlib`. + +\footnotesize +```python + ...... + m.minos() + print (m.get_merrors()['a']) + m.draw_mnprofile('b') + m.draw_mncontour('a', 'b', nsigma=4) +``` +::: columns +:::: {.column width=40%} +![](figures/iminuit_minos_scan-1.png) +:::: +:::: {.column width=40%} +![](figures/iminuit_minos_scan-2.png) +:::: +::: + +## Exercise 3 + +Plot the following data with mathplotlib as in the iminuit example: + + \footnotesize +``` + x: 0.2,0.4,0.6,0.8,1.,1.2,1.4,1.6,1.8,2.,2.2,2.4,2.6,2.8,3.,3.2, + 3.4,3.6, 3.8,4. 
+ y: 0.04,0.021,0.035,0.03,0.029,0.019,0.024,0.018,0.019,0.022,0.02, + 0.025,0.018,0.024,0.019,0.021,0.03,0.019,0.03,0.024 + dy: 1.792,1.695,1.541,1.514,1.427,1.399,1.388,1.270,1.262,1.228,1.189, + 1.182,1.121,1.129,1.124,1.089,1.092,1.084,1.058,1.057 +``` +\normalsize + \setbeamertemplate{itemize item}{\color{red}$\square$} + +* Exchange in the example iminuit fit `02_fit_exp_fit_iMinuit.ipynb` the + exponential function by a 3rd order polynomial and perform the fit + +* Compare the correlation of the parameters of the exponential and + the polynomial fit + +* What defines the fit quality, give an estimate + + \small + Solution: [\textcolor{violet}{02\_fit\_ex\_3\_sol.py}](https://www.physi.uni-heidelberg.de/~reygers/lectures/2021/ml/solutions/02_fit_ex_3_sol.py) \normalsize + +## Exercise 4 + +Plot the following data with mathplotlib: + + \footnotesize +``` + x: 1, 2, 3, 4, 5, 6, 7, 8, 9, 10 + dx: 0.1,0.1,0.5,0.1,0.5,0.1,0.5,0.1,0.5,0.1 + y: 1.1,2.3,2.7,3.2,3.1,2.4,1.7,1.5,1.5,1.7 + dy: 0.15,0.22,0.29,0.39,0.31,0.21,0.13,0.15,0.19,0.13 +``` +\normalsize + \setbeamertemplate{itemize item}{\color{red}$\square$} + + * Perform a fit with iminuit. Which model do you use? + + * Plot the resulting fit function in the graph with the data + + * Print the covariance matrix. Can we improve the errors. + + * Can you draw a contour plot of 2 of the fit parameters. + + \small + Solution: [\textcolor{violet}{02\_fit\_ex\_4\_sol.py}](https://www.physi.uni-heidelberg.de/~reygers/lectures/2021/ml/solutions/02_fit_ex_4_sol.py) \normalsize + + +## PyROOT + +[\textcolor{violet}{PyROOT}](https://root.cern/manual/python/) is the python binding for the C++ data analysis toolkit [\textcolor{violet}{ROOT}](https://root.cern/) developed with and for the LHC community. You can access the full +ROOT functionality from Python while +benefiting from the performance of the ROOT C++ libraries. The PyROOT bindings +are automatic and dynamic and are able to interoperate with widely-used Python +data-science libraries as `NumPy`, `pandas`, SciPy `scikit-learn` and `tensorflow`. + +* ROOT/PyROOT can be installed easily within anaconda3 (ROOT version 6.22.02 + or later ) or is available in the + [\textcolor{violet}{CIP jupyter2 Hub}](https://jupyter2.kip.uni-heidelberg.de/) + +* Tools for statistical analysis, a math library with optimized algorithms, + multivariate analysis, visualization and simulation of data. + +* Storing data including objects and classes with compression in files is a + very powerfull aspect for any data analysis project + +* Within PyROOT Minuit2 can be accessed easily either with predefined functions + or your own function definition + +* For advanced statistical analyses and data modeling likelihood fitting with + the packages **rooFit** and **rooStats** is available. + + +## + +* Example reading the invariant mass measurements of a $D^0$ from a text file + and determine $\mu$ and $\sigma$ \hspace{1.0cm} \small + [\textcolor{violet}{02\_fit\_histFit.py}](https://www.physi.uni-heidelberg.de/~reygers/lectures/2021/ml/examples/02_fit_histFit.py) + \normalsize + +\footnotesize +```python + import numpy as np + import math + from ROOT import TCanvas, TFile, TH1D, TF1, TMinuit, TFitResult + data = np.genfromtxt('D0Mass.txt', dtype='d') # read data from text file + c = TCanvas('c','D0 Mass',200,10,700,500) # instanciate output canvas + d0 = TH1D('d0','D0 Mass',200,1700.,2000.) 
# instanciate histogramm + for x in data : # fill data into histogramm d0 + d0.Fill(x) + def pyf_tf1_params(x, p): # define fit function + return p[0] * math.exp (-0.5 * ((x[0] - p[1])**2 / p[2]**2)) + func = TF1("func",pyf_tf1_params,1840.,1880.,3) + # func = TF1("func",'gaus',1840.,1880.) # use predefined function + func.SetParameters(500.,1860.,5.5) # set start parameters + myfit = d0.Fit(func,"S") # fit function to the histogramm data + print ("Fit results: mean=",myfit.Parameter(0)," +/- ",myfit.ParError(0)) + c.Draw() # draw canvas + myfile = TFile('myOutFile.root','RECREATE') # Open a ROOT file for output + c.Write() # Write canvas + d0.Write() # Write histogram + myfile.Close() # close file +``` +\normalsize + + +## + +* Fit Options +\vspace{0.1cm} + +::: columns +:::: {.column width=2%} +:::: +:::: {.column width=98%} +![](figures/rootOptions.png) +:::: +::: + +## Exercise 5 + + Read text file [\textcolor{violet}{FitTestData.txt}](https://www.physi.uni-heidelberg.de/~reygers/lectures/2021/ml/exercises/FitTestData.txt) and draw a histogramm using PyROOT. + \setbeamertemplate{itemize item}{\color{red}$\square$} + +* Determine the mean and sigma of the signal distribution. Which function do + you use for fitting? + +* The option S fills the result object. + +* Try to improve the errors of the fit values with minos using the option E + and also try the option M to scan for a new minimum, option V provides more + output. + +* Fit the background outside the signal region use the option R+ to add the + function to your fit + + \small + Solution: [\textcolor{violet}{02\_fit\_ex\_5\_sol.py}](https://www.physi.uni-heidelberg.de/~reygers/lectures/2021/ml/solutions/02_fit_ex_5_sol.py) \normalsize + + +## iPython Examples for Fitting + + The different python packages are used in + \textcolor{blue}{example iPython notebooks} + to demonstrate the fitting of a third order polynomial to the same data + available as numpy arrays. + + \setbeamertemplate{itemize item}{\color{red}\tiny$\blacksquare$} + + * LSQ fit of a polynomial to data using Minuit2 with + \textcolor{blue}{iminuit} and \textcolor{blue}{matplotlib} plot: + + \small + [\textcolor{violet}{02\_fit\_iminuitFit.ipynb}](https://www.physi.uni-heidelberg.de/~reygers/lectures/2021/ml/examples/02_fit_iminuitFit.ipynb) + \normalsize + + * Graph fitting with \textcolor{blue}{pyROOT} with options using a python + function including confidence level plot: + + \small + [\textcolor{violet}{02\_fit\_fitGraph.ipynb}](https://www.physi.uni-heidelberg.de/~reygers/lectures/2021/ml/examples/02_fit_fitGraph.ipynb) + \normalsize + + * Graph fitting with \textcolor{blue}{numpy} and confidence level + plotting with \textcolor{blue}{matplotlib}: + + \small + [\textcolor{violet}{02\_fit\_numpyFit.ipynb}](https://www.physi.uni-heidelberg.de/~reygers/lectures/2021/ml/examples/02_fit_numpyFit.ipynb) + \normalsize + + * Graph fitting with a polynomial fit of \textcolor{blue}{scikit-learn} and + plotting with \textcolor{blue}{matplotlib}: + + \normalsize + \small + [\textcolor{violet}{02\_fit\_scikitFit.ipynb}](https://www.physi.uni-heidelberg.de/~reygers/lectures/2021/ml/examples/02_fit_scikitFit.ipynb) + \normalsize diff --git a/slides/intro_python.md b/slides/intro_python.md new file mode 100644 index 0000000..c214823 --- /dev/null +++ b/slides/intro_python.md @@ -0,0 +1,830 @@ +--- +title: | + | Introduction to Data Analysis and Machine Learning in Physics: + | 1. 
Introduction to python + +author: "Martino Borsato, Jörg Marks, Klaus Reygers" +date: "Studierendentage, 11-14 April 2022" +--- + +## Outline of the $1^{st}$ day + +* Technical instructions for your interactions with the CIP pool, like + * using the jupyter hub + * using python locally in your own linux environment (anaconda) + * access the CIP pool from your own windows or linux system + * transfer data from and to the CIP pool + + Can be found in [\textcolor{violet}{CIPpoolAccess.PDF}](https://www.physi.uni-heidelberg.de/~marks/root_einfuehrung/Folien/CIPpoolAccess.pdf)\normalsize + +* Summary of NumPy + +* Plotting with matplotlib + +* Input / output of data + +* Summary of pandas + +* Fitting with iminuit and pyROOT + + +## A glimpse into python classes + + The following python classes are important to data analysis and machine + learning will be used during the course + + * [\textcolor{violet}{NumPy}](https://numpy.org/doc/stable/user/basics.html) - python library adding support for large, + multi-dimensional arrays and matrices, along with high-level + mathematical functions to operate on these arrays + + * [\textcolor{violet}{matplotlib}](https://matplotlib.org/stable/tutorials/index.html) - a python plotting library + + * [\textcolor{violet}{SciPy}](https://docs.scipy.org/doc/scipy/reference/tutorial/index.html) - extension of NumPy by a collection of + mathematical algorithms for minimization, regression, + fourier transformation, linear algebra and image processing + + * [\textcolor{violet}{iminuit}](https://iminuit.readthedocs.io/en/stable/) - + python wrapper to the data fitting toolkit + [\textcolor{violet}{Minuit2}](https://root.cern.ch/doc/master/Minuit2Page.html) + developed at CERN by F. James in the 1970ies + + * [\textcolor{violet}{pyROOT}](https://root.cern/manual/python/) - python wrapper to the C++ data analysis toolkit + ROOT used at the LHC + + * [\textcolor{violet}{scikit-learn}](https://scikit-learn.org/stable/) - machine learning library written in + python, which makes use extensively of NumPy for high-performance + linear algebra algorithms + +## NumPy + + \textcolor{blue}{NumPy} (Numerical Python) is an open source Python library, + which contains multidimensional array and matrix data structures and methods + to efficiently operate on these. The core object is + a homogeneous n-dimensional array object, \textcolor{blue}{ndarray}, which + allows for a wide variety of \textcolor{blue}{fast operations and mathematical calculations + with arrays and matrices} due to the extensive usage of compiled code. + + * It is heavily used in numerous scientific python packages + + * `ndarray` 's have a fixed size at creation $\rightarrow$ changing size + leads to recreation + + * Array elements are all required to be of the same data type + + * Facilitates advanced mathematical operations on large datasets + + * See for a summary, e.g.    + \small +[\textcolor{violet}{https://cs231n.github.io/python-numpy-tutorial/\#numpy}](https://cs231n.github.io/python-numpy-tutorial/#numpy) \normalsize + +\vfill + +::: columns +:::: {.column width=30%} + +:::: +::: + +::: columns +:::: {.column width=35%} + +`c = []` + +`for i in range(len(a)):` + +    `c.append(a[i]*b[i])` + +:::: + +:::: {.column width=35%} + +with NumPy + +`c = a * b` + +:::: +::: + + + +## NumPy - array basics + +* numpy arrays build a grid of \textcolor{blue}{same type} values, which are indexed. + The *rank* is the dimension of the array. + There are methods to create and preset arrays. 
+ +\footnotesize + +```python + myA = np.array([2, 5 , 11]) # create rank 1 array (vector like) + type(myA) # + myA.shape # (3,) + print(myA[2]) # 11 access 3. element + myA[0] = 12 # set 1. element to 12 + myB = np.array([[1,5],[7,9]]) # create rank 2 array + myB.shape # (2,2) + print(myB[0,0],myB[0,1],myB[1,1]) # 1 5 9 + myC = np.arange(6) # create rank 1 set to 0 - 5 + myC.reshape(2,3) # change rank to (2,3) + + zero = np.zeros((2,5)) # 2 rows, 5 columns, set to 0 + one = np.ones((2,2)) # 2 rows, 2 columns, set to 1 + five = np.full((2,2), 5) # 2 rows, 2 columns, set to 5 + e = np.eye(2) # create 2x2 identity matrix +``` +\normalsize + + +## NumPy - array indexing (1) + +* select slices of a numpy array + +\footnotesize +```python + a = np.array([[1,2,3,4], + [5,6,7,8], # 3 rows 4 columns array + [9,10,11,12]]) + b = a[:2, 1:3] # subarray of 2 rows and + array([[2, 3], # column 1 and 2 + [6, 7]]) +``` +\normalsize + +* a slice of an array points into the same data, *modifying* changes the original array! + +\footnotesize +```python + b[0, 0] = 77 # b[0,0] and a[0,1] are 77 + + r1_row = a[1, :] # get 2nd row -> rank 1 + r1_row.shape # (4,) + r2_row = a[1:2, :] # get 2nd row -> rank 2 + r2_row.shape # (1,4) + a=np.array([[1,2],[3,4],[5,6]]) # set a , 3 rows 2 cols + d=a[[0, 1, 2], [0, 1, 1]] # d contains [1 4 6] + e=a[[1, 2], [1, 1]] # e contains [4 6] + np.array([a[0,0],a[1,1],a[2,0]]) # address elements explicitly +``` +\normalsize + + +## NumPy - array indexing (2) + + +* integer array indexing by setting an array of indices $\rightarrow$ selecting/changing elements + +\footnotesize +```python + a = np.array([[1,2,3,4], + [5,6,7,8], # 3 rows 4 columns array + [9,10,11,12]]) + p_a = np.array([0,2,0]) # Create an array of indices + s = a[np.arange(3), p_a] # number the rows, p_a points to cols + print (s) # s contains [1 7 9] + a[np.arange(3),p_a] += 10 # add 10 to corresponding elements + x=np.array([[8,2],[7,4]]) # create 2x2 array + bool = (x > 5) # bool : array of boolians + # [[True False] + # [True False]] + print(x[x>5]) # select elements, prints [8 7] +``` +\normalsize + +* data type in numpy - create according to input numbers or set explicitly + +\footnotesize + +```python + x = np.array([1.1, 2.1]) # create float array + print(x.dtype) # print float64 + y=np.array([1.1,2.9],dtype=np.int64) # create float array [1 2] +``` +\normalsize + + +## NumPy - functions + +* math functions operate elementwise either as operator overload or as methods + +\footnotesize +```python + x=np.array([[1,2],[3,4]],dtype=np.float64) # define 2x2 float array + y=np.array([[3,1],[5,1]],dtype=np.float64) # define 2x2 float array + s = x + y # elementwise sum + s = np.add(x,y) + s = np.subtract(x,y) + s = np.multiply(x,y) # no matrix multiplication! + s = np.divide(x,y) + s = np.sqrt(x), np.exp(x), ... 
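+  # (the line above and the line below each list several alternatives on one
+  #  line; they are meant as a quick reference, not to be executed literally)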
+ x @ y , or np.dot(x, y) # matrix product + np.sum(x, axis=0) # sum of each column + np.sum(x, axis=1) # sum of each row + xT = x.T # transpose of x + x = np.linspace(0,2*pi,100) # get equal spaced points in x + + r = np.random.default_rng(seed=42) # constructor random number class + b = r.random((2,3)) # random 2x3 matrix +``` +\normalsize + + + +## + +* broadcasting in numpy + \vspace{0.4cm} + + The term broadcasting describes how numpy treats arrays with different + shapes during arithmetic operations + + * add a scalar $b$ to a 1D array $a = [a_1,a_2,a_3]$ $\rightarrow$ expand $b$ to + $[b,b,b]$ + \vspace{0.2cm} + + * add a scalar $b$ to a 2D [2,3] array $a =[[a_{11},a_{12},a_{13}],[a_{21},a_{22},a_{23}]]$ + $\rightarrow$ expand $b$ to $b =[[b,b,b],[b,b,b]]$ and add element wise + \vspace{0.2cm} + + * add 1D array $b = [b_1,b_2,b_3]$ to a 2D [2,3] array $a=[[a_{11},a_{12},a_{13}],[a_{21},a_{22},a_{23}]]$ $\rightarrow$ 1D array is broadcast + across each row of the 2D array $b =[[b_1,b_2,b_3],[b_1,b_2,b_3]]$ and added element wise + \vspace{0.2cm} + + Arithmetic operations can only be performed when the shape of each + dimension in the arrays are equal or one has the dimension size of 1. Look + [\textcolor{violet}{here}](https://numpy.org/doc/stable/user/basics.broadcasting.html) for more details + +\footnotesize +```python + # Add a vector to each row of a matrix + x = np.array([[1,2,3], [4,5,6]]) # x has shape (2, 3) + v = np.array([1,2,3]) # v has shape (3,) + x + v # [[2 4 6] + # [5 7 9]] +``` +\normalsize + +## Plot data + +A popular library to present data is the `pyplot` module of `matplotlib`. + +* Drawing a function in one plot + +\footnotesize +::: columns +:::: {.column width=35%} +```python +import numpy as np +import matplotlib.pyplot as plt +# generate 100 points from 0 to 2 pi +x = np.linspace( 0, 10*np.pi, 100 ) +f = np.sin(x)**2 +# plot function +plt.plot(x,f,'blueviolet',label='sine') +plt.xlabel('x [radian]') +plt.ylabel('f(x)') +plt.title('Plot sin^2') +plt.legend(loc='upper right') +plt.axis([0,30,-0.1,1.2]) # limit the plot range + +# show the plot +plt.show() +``` +:::: +:::: {.column width=40%} +![](figures/matplotlib_Figure_1.png) +:::: +::: + +\normalsize + +## +* Drawing subplots in one canvas + +\footnotesize +::: columns +:::: {.column width=35%} +```python +... +g = np.exp(-0.2*x) +# create figure +plt.figure(num=2,figsize=(10.0,7.5),dpi=150,facecolor='lightgrey') +plt.suptitle('1 x 2 Plot') +# create subplot and plot first one +plt.subplot(1,2,1) +# plot first one +plt.title('exp(x)') +plt.xlabel('x') +plt.ylabel('g(x)') +plt.plot(x,g,'blueviolet') +# create subplot and plot second one +plt.subplot(1,2,2) +plt.plot(x,f,'orange') +plt.plot(x,f*g,'red') +plt.legend(['sine^2','exp*sine']) +# show the plot +plt.show() +``` +:::: +:::: {.column width=40%} +\vspace{3cm} +![](figures/matplotlib_Figure_2.png) +:::: +::: +\normalsize + +## Image data + +The `image` class of the `matplotlib` library can be used to load the image +to numpy arrays and to render the image. + +* There are 3 common formats for the numpy array + + * (M, N) scalar data used for greyscale images + + * (M, N, 3) for RGB images (each pixel has an array with RGB color attached) + + * (M, N, 4) for RGBA images (each pixel has an array with RGB color + and transparency attached) + + + The method `imread` loads the image into an `ndarray`, which can be + manipulated. 
+ + The method `imshow` renders the image data + + \vspace {2cm} + +## +* Drawing pixel data and images + +\footnotesize +::: columns +:::: {.column width=50%} + +```python +.... +# create data array with pixel postion and RGB color code +width, height = 400, 400 +data = np.zeros((height, width, 3), dtype=np.uint8) +# red patch in the center +data[175:225, 175:225] = [255, 0, 0] +x = np.random.randint(0,width-1,100) +y = np.random.randint(0,height-1,100) +data[x,y]= [0,255,0] # random green pixel +plt.imshow(data) +plt.show() +.... +import matplotlib.image as mpimg +#read image into numpy array +pic = mpimg.imread('picture.jpg') +mod_pic = pic[:,:,0] # grab slice 0 of the colors +plt.imshow(mod_pic) # use default color code also +plt.colorbar() # try cmap='hot' +plt.show() +``` +:::: +:::: {.column width=25%} +![](figures/matplotlib_Figure_3.png) +\vspace{1cm} +![](figures/matplotlib_Figure_4.png) +:::: +::: +\normalsize + + +## Input / output + +For the analysis of measured data efficient input \/ output plays an +important role. In numpy, `ndarrays` can be saved and read in from files. +`load()` and `save()` functions handle numpy binary files (.npy extension) +which contain data, shape, dtype and other information required to +reconstruct the `ndarray` of the disk file. + +\footnotesize +```python + r = np.random.default_rng() # instanciate random number generator + a = r.random((4,3)) # random 4x3 array + np.save('myBinary.npy', a) # write array a to binary file myBinary.npy + b = np.arange(12) + np.savez('myComp.npz', a=a, b=b) # write a and b in compressed binary file + ...... + b = np.load('myBinary.npy') # read content of myBinary.npy into b +``` +\normalsize + +The storage and retrieval of array data in text file format is done +with `savetxt()` and `loadtxt()` methods. Parameter controling delimiter, +line separators, file header and footer can be specified. + +\footnotesize +```python + x = np.array([1,2,3,4,5,6,7]) # create ndarray + np.savetxt('myText.txt',x,fmt='%d') # write array x to text file myText.txt + ..... + y = np.loadtxt('myText.txt',dtype=int) # read content of myText.txt in y +``` +\normalsize + + +## Exercise 1 + +i) Display a numpy array as figure of a blue cross. The size should be 200 + by 200 pixel. Use as array format (M, N, 3), where the first 2 specify + the pixel positions and the last 3 the rbg color from 0:255. + - Draw in addition a red square of arbitrary position into the figure. + - Draw a circle in the center of the figure. Try to create a mask which + selects the inner part of the circle using the indexing. + + \small + [Solution: 01_intro_ex_1a_sol.py](https://www.physi.uni-heidelberg.de/~reygers/lectures/2021/ml/solutions/01_intro_ex_1a_sol.py) \normalsize + +ii) Read data which contains pixels from the binary file horse.py into a + numpy array. Display the data and the following transformations in 4 + subplots: scaling and translation, compression in x and y, rotation + and mirroring. + + \small + [Solution: 01_intro_ex_1b_sol.py](https://www.physi.uni-heidelberg.de/~reygers/lectures/2021/ml/solutions/01_intro_ex_1b_sol.py) \normalsize + + +## Pandas + +[\textcolor{violet}{pandas}](https://pandas.pydata.org/pandas-docs/stable/getting_started/index.html) is a software library written in Python for +\textcolor{blue}{data manipulation and analysis}. 
+ + \vspace{0.4cm} + +\setbeamertemplate{itemize item}{\color{red}\tiny$\blacksquare$} + +* Offers data structures and operations for manipulating numerical tables with + integrated indexing + +* Imports data from various file formats, e.g. comma-separated values, JSON, + SQL or Excel + +* Tools for reading and writing data structures, allows analyzing, filtering, + spliting, merging and joining + +* Built on top of `NumPy` + +* Visualize the data with `matplotlib` + +* Most machine learning tools support `pandas` $\rightarrow$ + it is widely used to preprocess data sets for machine learning + +## Pandas micro introduction + +Goal: Exploring, cleaning, transforming, and visualization of data. +The basic indexable objects are + +\setbeamertemplate{itemize item}{\color{red}\tiny$\blacksquare$} + +* `Series` -> vector (list) of data elements of arbitrary type + +* `DataFrame` -> tabular arangement of data elements of column wise + arbitrary type + + Both allow cleaning data by removing of `empty` or `nan` data entries + +\footnotesize +```python + import numpy as np + import pandas as pd # use together with numpy + s = pd.Series([1, 3, 5, np.nan, 6, 8]) # create a Series of float64 + r = pd.Series(np.random.randn(4)) # Series of random numbers float64 + dates = pd.date_range("20130101", periods=3) # index according to dates + df = pd.DataFrame(np.random.randn(3,4),index=dates,columns=list("ABCD")) + print (df) # print the DataFrame + A B C D + 2013-01-01 1.618395 1.210263 -1.276586 -0.775545 + 2013-01-02 0.676783 -0.754161 -1.148029 -0.244821 + 2013-01-03 -0.359081 0.296019 1.541571 0.235337 + + new_s = s.dropna() # return a new Data Frame with no empty cells +``` +\normalsize + +## + +\setbeamertemplate{itemize item}{\color{red}\tiny$\blacksquare$} + +* pandas data can be saved in different file formats (CSV, JASON, html, XML, + Excel, OpenDocument, HDF5 format, .....). `NaN` entries are kept + in the output file. + + * csv file + \footnotesize + ```python + df.to_csv("myFile.csv") # Write the DataFrame df to a csv file + ``` + \normalsize + + * HDF5 output + + \footnotesize + ```python + df.to_hdf("myFile.h5",key='df',mode='w') # Write the DataFrame df to HDF5 + s.to_hdf("myFile.h5", key='s',mode='a') + ``` + \normalsize + + * Writing to an excel file + + \footnotesize + ```python + df.to_excel("myFile.xlsx", sheet_name="Sheet1") + ``` + \normalsize + +* Deleting file with data in python + +\footnotesize +```python + import os + os.remove('myFile.h5') +``` +\normalsize + +## + +\setbeamertemplate{itemize item}{\color{red}\tiny$\blacksquare$} + +* read in data from various formats + + * csv file + + \footnotesize + + ```python + ....... 
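+       # ('.......' stands for the omitted setup, e.g. import pandas as pd;
+       #  heart.csv is assumed to be in the current working directory)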
+ df = pd.read_csv('heart.csv') # read csv data table + print(df.info()) + + RangeIndex: 303 entries, 0 to 302 + Data columns (total 14 columns): + # Column Non-Null Count Dtype + --- ------ -------------- ----- + 0 age 303 non-null int64 + 1 sex 303 non-null int64 + 2 cp 303 non-null int64 + print(df.head(5)) # prints the first 5 rows of the data table + print(df.describe()) # shows a quick statistic summary of your data + ``` +\normalsize + + * Reading an excel file + + \footnotesize + ```python + df = pd.read_excel("myFile.xlsx","Sheet1", na_values=["NA"]) + ``` + \normalsize + + \textcolor{olive}{There are many options specifying details for IO.} + +## + +\setbeamertemplate{itemize item}{\color{red}\tiny$\blacksquare$} + +* Various functions exist to select and view data from pandas objects + + * Display column and index + + \footnotesize + + ```python + df.index # show datetime index of df + DatetimeIndex(['2013-01-01','2013-01-02','2013-01-03'], + dtype='datetime64[ns]',freq='D') + df.column # show columns info + Index(['A', 'B', 'C', 'D'], dtype='object') + ``` + \normalsize + + * `DataFrame.to_numpy()` gives a `NumPy` representation of the underlying data + + \footnotesize + + ```python + df.to_numpy() # one dtype for the entire array, not per column! + [[-0.62660101 -0.67330526 0.23269168 -0.67403546] + [-0.53033339 0.32872063 -0.09893568 0.44814084] + [-0.60289996 -0.22352548 -0.43393248 0.47531456]] + ``` + \normalsize + + Does not include the index or column labels in the output + + * more on viewing + + \footnotesize + + ```python + df.T # transpose the DataFrame df + df.sort_values(by="B") # Sorting by values of a column of df + df.sort_index(axis=0,ascending=False) # Sorting by index descending values + df.sort_index(axis=0,ascending=False) # Display columns in inverse order + + ``` + \normalsize + +## + +\setbeamertemplate{itemize item}{\color{red}\tiny$\blacksquare$} + +* Selecting data of pandas objects $\rightarrow$ keep or reduce dimensions + + * get a named column as a Series + + \footnotesize + + ```python + df["A"] # selects a column A from df, simular to df.A + df.iloc[:, 1:2] # slices column A explicitly from df, df.loc[:, ["A"]] + ``` + \normalsize + + * select rows of a DataFrame + + \footnotesize + + ```python + df[0:2] # selects row 0 and 1 from df, + df["20130102":"20130103"] # use indices endpoint are included! 
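+      # note: label-based slices like the one above include both endpoints,
+      # while integer-position slices with iloc (below) exclude the stop value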
+ df.iloc[3] # Select with the position of the passed integers + df.iloc[1:3, :] # selects row 1 and 2 from df + ``` + \normalsize + + * select by label + + \footnotesize + + ```python + df.loc["20130102":"20130103",["C","D"]] # selects row 1 and 2 and only C and D + df.loc[dates[0], "A"] # selects a single value (scalar) + ``` + \normalsize + + * select by lists of integer position (as in `NumPy`) + + \footnotesize + + ```python + df.iloc[[0, 2], [1, 3]] # select row 1 and 3 and col B and D + df.iloc[1, 1] # get a value explicitly + + ``` + \normalsize + + * select according to expressions + + \footnotesize + + ```python + df.query('B 0] # select df where all values of column A are >0 + df[df > 0] # select values from the entire DataFrame + ``` + \normalsize + + more complex example + + \footnotesize + + ```python + df2 = df.copy() # copy df + df2["E"] = ["eight","one","four"] # add column E + df2[df2["E"].isin(["two", "four"])] # test if elements "two" and "four" are + # contained in Series column E + ``` + \normalsize + + * Operations (in general exclude missing data) + + \footnotesize + + ```python + df2[df2 > 0] = -df2 # All elements > 0 change sign + df.mean(0) # get column wise mean (numbers=axis) + df.mean(1) # get row wise mean + df.std(0) # standard deviation according to axis + df.cumsum() # cumulative sum of each column + df.apply(np.sin) # apply function to each element of df + df.apply(lambda x: x.max() - x.min()) # apply lambda function column wise + df + 10 # add scalar 10 + df - [1, 2, 10 , 100] # subtract values of each column + df.corr() # Compute pairwise correlation of columns + ``` + \normalsize + + +## Pandas - plotting data + +[\textcolor{violet}{Visualization}](https://pandas.pydata.org/pandas-docs/stable/user_guide/visualization.html) is integrated in pandas using mathplotlib. Here are only 2 examples + +* Plot random data in histogramm and scatter plot + +\footnotesize +```python + # create DataFrame with random normal distributed data + df = pd.DataFrame(np.random.randn(1000,4),columns=["a","b","c","d"]) + df = df + [1, 3, 8 , 10] # shift mean to 1, 3, 8 , 10 + plt.figure() + df.plot.hist(bins=20) # histogram all 4 columns + g1 = df.plot.scatter(x="a",y="c",color="DarkBlue",label="Group 1") + df.plot.scatter(x="b",y="d",color="DarkGreen",label="Group 2",ax=g1) +``` +\normalsize + +::: columns +:::: {.column width=35%} +![](figures/pandas_histogramm.png) +:::: +:::: {.column width=35%} +![](figures/pandas_scatterplot.png) +:::: +::: + +## Pandas - plotting data + +The function crosstab() takes one or more array-like objects as indexes or +columns and constructs a new DataFrame of variable counts on the inputs + +\footnotesize +```python + df = pd.DataFrame( # create DataFrame of 2 categories + {"sex": np.array([0,0,0,0,1,1,1,1,0,0,0]), + "heart": np.array([1,1,1,0,1,1,1,0,0,0,1]) + } ) # closing bracket goes on next line + pd.crosstab(df2.sex,df2.heart) # create cross table of possibilities + pd.crosstab(df2.sex,df2.heart).plot(kind="bar",color=['red','blue']) # plot counts +``` +\normalsize +::: columns +:::: {.column width=42%} +![](figures/pandas_crosstabplot.png) +:::: +::: + +## Exercise 2 + +Read the file [\textcolor{violet}{heart.csv}](https://www.physi.uni-heidelberg.de/~reygers/lectures/2021/ml/exercises/heart.csv) into a DataFrame. 
+[\textcolor{violet}{Information on the dataset}](https://archive.ics.uci.edu/ml/datasets/heart+Disease) + +\setbeamertemplate{itemize item}{\color{red}$\square$} + + * Which columns do we have + + * Print the first 3 rows + + * Print the statistics summary and the correlations + + * Print mean values for each column with and without disease + + * Select the data according to `sex` and `target` (heart disease 0=no 1=yes). + + * Plot the `age` distribution of male and female in one histogram + + * Plot the heart disease distribution according to chest pain type `cp` + + * Plot `thalach` according to `target` in one histogramm + + * Plot `sex` and `target` in a histogramm figure + + * Correlate `age` and `max heart rate` according to `target` + + * Correlate `age` and `colesterol` according to `target` + + \small + [Solution: 01_intro_ex_2_sol.py](https://www.physi.uni-heidelberg.de/~reygers/lectures/2021/ml/solutions/01_intro_ex_2_sol.py) \normalsize + + + + + + + diff --git a/slides/ml_basics.md b/slides/ml_basics.md new file mode 100644 index 0000000..d7bef87 --- /dev/null +++ b/slides/ml_basics.md @@ -0,0 +1,1157 @@ +--- +title: | + | Introduction to Data Analysis and Machine Learning in Physics: + | 3. Machine Learning Basics + +author: "Martino Borsato, Jörg Marks, Klaus Reygers" +date: "Studierendentage, 11-14 April 2022" +--- + +## Exercises + +* Exercise 1: Air shower classification (MAGIC telescope) + * Logistic regression + * [`03_ml_basics_ex01_magic.ipynb`](https://nbviewer.jupyter.org/urls/www.physi.uni-heidelberg.de/~reygers/lectures/2022/ml/exercises/03_ml_basics_ex_1_magic.ipynb) +* Exercise 2: Hand-written digit recognition with logistic regression + * Logistic regression + * [`03_ml_basics_ex02_mnist_softmax_regression.ipynb`](https://nbviewer.jupyter.org/urls/www.physi.uni-heidelberg.de/~reygers/lectures/2022/ml/exercises/03_ml_basics_ex_2_mnist_softmax_regression.ipynb) +* Exercise 3: Data preprocessing + +## What is machine learning? (1) +![](figures/deepl.png) + +## What is machine learning? (2) +"Machine learning is the subfield of computer science that gives computers the ability to learn without being explicitly programmed" -- Wikipedia + +\vspace{2ex} +Example: spam detection \hfill +\scriptsize [\textcolor{gray}{J. Mayes, Machine learning 101}](https://docs.google.com/presentation/d/1kSuQyW5DTnkVaZEjGYCkfOxvzCqGEFzWBy4e9Uedd9k/preview?imm_mid=0f9b7e&cmp=em-data-na-na-newsltr_20171213&slide=id.g168a3288f7_0_58) +\normalsize + +\begin{center} +\includegraphics[width=0.9\textwidth]{figures/ml_example_spam.png} +\vspace{2ex} + +Manual feature engineering vs. automatic feature detection +\end{center} + +## AI, ML, and DL +"AI is the study of how to make computers perform things that, at the moment, people do better." +\tiny \textcolor{gray}{Elaine Rich, Artificial intelligence, McGraw-Hill 1983} \normalsize +\vfill +\tiny \hfill \textcolor{gray}{G. Marcus, E. 
Davis, Rebooting AI} \normalsize +\begin{figure} +\centering +%![](figures/ai_ml_dl.pdf){width=70%} +\includegraphics[width=0.7\textwidth]{figures/ai_ml_dl.pdf} +\end{figure} + +\vfill +"deep" in deep learning: artificial neural nets with many neurons and multiple layers of nonlinear processing units for feature extraction + +## Multivariate analysis: An early example from particle physics +::: columns +:::: {.column width=55%} +![](figures/mva.png){width=99%} +:::: +:::: {.column width=45%} +* Signal: $e^+e^- \to W^+W^-$ + * often 4 well separated hadron jets +* Background: $e^+e^- \to qqgg$ + * 4 less well separated hadron jets +* Input variables based on jet structure, event shape, ... none by itself gives much separation. +![](figures/mva_nn.png){width=85%} +\tiny \textcolor{gray}{(Garrido, Juste and Martinez, ALEPH 96-144)} \normalsize +:::: +::: + +## Applications of machine learning in physics + +* Particle physics: Particle identification / classification +* Astronomy: Galaxy morphology classification +* Chemistry and material science: predict properties of new molecules / materials +* Many-body quantum matter: classification of quantum phases + +\vspace{3ex} +\scriptsize [\textcolor{gray}{Machine learning and the physical sciences, arXiv:1903.10563}](https://arxiv.org/abs/1903.10563) \normalsize + +## Some successes and unsolved problems in AI +::: columns +:::: {.column width=50%} +![](figures/ai_history.png){width=85%} + +\tiny \textcolor{gray}{M. Woolridge, The road to conscious machines} \normalsize + +:::: +:::: {.column width=50%} + +Impressive progress in certain fields: + +\small +* Image recognition +* Speech recognition +* Recommendation systems +* Automated translation +* Analysis of medical data +\normalsize +\vfill + +How can we profit from these developments in physics? +:::: +::: + +## The deep learning hype -- why now? +Artificial neural networks are around for decades. Why did deep learning take off after 2012? + +\vspace{5ex} + +* Improved hardware -- graphical processing units [GPUs] +* Large data sets (e.g. images) distributed via the Internet +* Algorithmic advances + + +## Different modeling approaches + +* Simple mathematical representation like linear regression. Favored by statisticians. +* Complex deterministic models based on scientific understanding of the physical process. Favored by physicists. +* Complex algorithms to make predictions that are derived from a huge number of past examples (“machine learning” as developed in the field of computer science). These are often black boxes. +* Regression models that claim to reach causal conclusions. Used by economists. + +\tiny \textcolor{gray}{D. 
Spiegelhalter, The Art of Statistics – Learning from data} \normalsize + + +## Machine learning: The "hello world" problem +::: columns +:::: {.column width=45%} + +Recognition of handwritten digits + +* MNIST database (Modified National Institute of Standards and Technology database) +* 60,000 training images and 10,000 testing images labeled with correct answer +* 28 pixel x 28 pixel +* Algorithms have reached "near-human performance" +* Smallest error rate (2018): 0.18\% + +:::: +:::: {.column width=55%} +![](figures/mnist.png) + +\tiny +[\color{gray}{\texttt{https://en.wikipedia.org/wiki/MNIST\_database}}](https://en.wikipedia.org/wiki/MNIST_database) +\normalsize + +:::: +::: + +## Machine learning: Image recognition +ImageNet database + +* 14 million images, 22,000 categories +* Since 2010, the annual ImageNet Large Scale Visual Recognition Challenge (ILSVRC): 1.4 million images, 1000 categories +* In 2017, 29 of 38 competing teams got less than 5\% wrong + +\begin{figure} +\centering +\includegraphics[width=0.8\textwidth]{figures/imagenet.png} +\end{figure} + +## ImageNet: Large Scale Visual Recognition Challenge + +\begin{figure} +\centering +\includegraphics[width=0.8\textwidth]{figures/imagenet_challenge.png} +\end{figure} + +\vfill + +\scriptsize +\textcolor{gray}{O. Russakovsky et al, arXiv:1409.0575} +\normalsize + +## Adversarial attack + +\begin{figure} +\centering +\includegraphics[width=\textwidth]{figures/adversarial_attack.png} +\end{figure} + +\vspace{3ex} +\scriptsize [\textcolor{gray}{Ian J. Goodfellow, Jonathon Shlens, Christian Szegedy, arXiv:1412.6572v1}](https://arxiv.org/abs/1412.6572v1) \normalsize + +## Types of machine learning +::: columns +:::: {.column width=60%} +Reinforcement learning + +\small +* The machine ("the agent") predicts a scalar reward given once in a while +* Weak feedback +\normalsize + +:::: +:::: {.column width=35%} +\tiny [\textcolor{gray}{LeCun 2018, Power And Limits of Deep Learning}](https://www.youtube.com/watch?v=0tEhw5t6rhc) \normalsize +![](figures/videogame.png) +:::: +::: +\vfill +::: columns +:::: {.column width=60%} + +\vspace{1em} +Supervised learning + +\small +* The machine predicts a category based on labeled training data +* Medium feedback +\normalsize +:::: +:::: {.column width=35%} +![](figures/supervised_learning_car_plane.png) +:::: +::: +\vfill +::: columns +:::: {.column width=60%} + +\vspace{1em} +Unsupervised learning + +\small +* Describe/find hidden structure from "unlabeled" data +* Cluster data in different sub-groups with similar properties +\normalsize +:::: +:::: {.column width=35%} +![](figures/anomaly_detection.png) +:::: +::: + +## Books on machine learning (1) + +::: columns +:::: {.column width=85%} +Ian Goodfellow and Yoshua Bengio and Aaron Courville, \textit{Deep Learning}, free online [http://www.deeplearningbook.org/](http://www.deeplearningbook.org/) + +\vspace{8ex} + +Kevin Murphy, \textit{Probabilistic Machine Learning: An Introduction}, [draft pdf version](https://probml.github.io/pml-book/) + +\vspace{7ex} + +Aurelien Geron, \textit{Hands-On Machine Learning with Scikit-Learn and TensorFlow} + +:::: +:::: {.column width=15%} +![](figures/deep_learning_book.png){width=65%} + +\vspace{3ex} + +![](figures/book-murphy.png){width=65%} + +\vspace{3ex} + +![](figures/hands_on_machine_learning.png){width=65%} + +:::: +::: + +## Books on machine learning (2) + +::: columns +:::: {.column width=85%} +Francois Chollet, \textit{Deep Learning with Python} + +\vspace{10ex} + +Martin Erdmann, Jonas 
Glombitza, Gregor Kasieczka, Uwe Klemradt, \textit{Deep Learning for Physics Research} + +:::: +:::: {.column width=15%} +![](figures/deep_learning_with_python.png){width=65%} + +\vspace{3ex} + +![](figures/book_deep_learning_for_physics_research.png){width=65%} + +:::: +::: + +## Papers + +A high-bias, low-variance introduction to Machine Learning for physicists + +[https://arxiv.org/abs/1803.08823](https://arxiv.org/abs/1803.08823) + +\vspace{3ex} + +Machine learning and the physical sciences + +[https://arxiv.org/abs/1903.10563](https://arxiv.org/abs/1903.10563) + +## Supervised learning in a nutshell +* Supervised Machine Learning requires labeled training data, i.e., a training sample where for each event it is known whether it is a signal or background event. +* Each event is characterized by $n$ observables: $\vec x = (x_1, x_2, ..., x_n) \;$ \textcolor{gray}{"feature vector"} + +\begin{figure} +\centering +\raisebox{-0.5\height}{\includegraphics[width=0.69\textwidth]{figures/supervised_nutshell.png}} +\raisebox{-0.5\height}{\includegraphics[width=0.30\textwidth]{figures/loss_fct.png}} +\end{figure} + +* Design function $y(\vec x, \vec w)$ with adjustable parameters $\vec w$ +* Design a loss function +* Find best parameters which minimize loss + + + +## Supervised learning: classification and regression + +The codomain $Y$ of the function y: $X \to Y$ can be a set of labels or classes or a continuous domain, e.g., $\mathbb{R}$ + +\vfill + +* $Y$ = finite set of labels $\quad \to \quad$ \textcolor{red}{classification} + * binary classification: $Y = \{0,1\}$ + * multi-class classification: $Y = \{c_1, c_2, ..., c_n\}$ +* $Y$ = real numbers $\quad \to \quad$ \textcolor{red}{regression} + +\vfill + +\textcolor{gray}{"All the impressive achievements of deep learning amount to just curve fitting" \\[0.5cm]} +\footnotesize +\textcolor{gray}{J. 
Pearl, Turing Award Winner 2011\\} +\tiny +[\color{gray}{To Build Truly Intelligent Machines, Teach Them Cause and Effect, Quantamagazine}](https://www.quantamagazine.org/to-build-truly-intelligent-machines-teach-them-cause-and-effect-20180515/) +\normalsize + +## Classification: Learning decision boundaries + +\begin{figure} +\centering +\includegraphics{figures/decision_boundaries.png} +\end{figure} + +## Supervised learning: Training, validation, and test sample +* Decision boundary fixed with \textcolor{blue}{training sample} +* Performance on training sample becomes better with more iterations +* Danger of overtraining: Statistical fluctuations of the training sample will be learnt +* \textcolor{blue}{Validation sample} = independent labeled data set not used for training $\rightarrow$ check for overtraining +* Sign of overtraining: performance on validation sample becomes worse $\rightarrow$ Stop training when signs of overtraining are observed (early stopping) +* Performance: apply classifier to independent \textcolor{blue}{test sample} +* Often: test sample = validation sample (only small bias) + +## Supervised learning: Cross validation + +Rule of thumb if training data not expensive + +::: columns +:::: {.column width=60%} +* Training sample: 50% +* Validation sample: 25% +* Test sample: 25% + +\vspace{2ex} + +Cross validation (efficient use of scarce training data) + +* Split training sample in $k$ independent subset $T_k$ of the full sample $T$ +* Train on $T \setminus T_k$ resulting in $k$ different classifiers +* For each training event there is one classifier that didn't use this event for training +* Validation results are then combined +:::: +:::: {.column width=40%} +\textcolor{gray}{Often test sample = validation sample (bias is rather small)} + +\vspace{10ex} +![](figures/cross_val.png) +:::: +::: + +## Often used loss functions +::: columns +:::: {.column width=45%} +\textcolor{blue}{Square error loss}: + +* often used in regression + +:::: +:::: {.column width=55%} +$$ E(y(\vec x, \vec w), t) = (y(\vec x, \vec w) - t)^2 $$ +:::: +::: + +\vfill + +::: columns +:::: {.column width=45%} +\textcolor{blue}{Cross entropy}: + +* $t \in \{0,1\}$ +* $y(\vec x, \vec w)$: predicted probability for outcome $t=1$ +* often used in classification + +:::: +:::: {.column width=55%} +\begin{align*} +E(y(\vec x, \vec w), t) = & - t \log y(\vec x, \vec w) \\ & - (1 - t) \log(1 - y(\vec x, \vec w)) +\end{align*} + +:::: +::: + +## More on entropy +* Self-information of an event $x$: $I(x) = - \log p(x)$ + * in units of **nats** (1 nat = information gained by observing an event of probability $1/e$) + +\vfill + +* Shannon entropy: $H(P) = - \sum p_i \log p_i$ + * Expected amount of information in an event drawn from a distribution $P$ + * Measure of the minimum of amount of bits needed on average to encode symbols drawn from a distribution + +\vfill + +* Cross entropy: $H(P,Q) = - E[\log Q] = - \sum p_i \log q_i$ + * Can be interpreted as a measure of the amount of bits needed when a wrong distribution Q is assumed while the data actually follows a distribution P + * Measure of dissimilarity between distributions P and Q (i.e, a measure of how well the model Q describes the true distribution P) + +## Hypothesis testing +::: columns +:::: {.column width=55%} +\includegraphics[width=\textwidth]{figures/signal_background_distr.png} +:::: +:::: {.column width=45%} +\vspace{2ex} +test statistic + +* a (usually scalar) variable which is a function of the data alone that can be used to test 
hypotheses +* example: $\chi^2$ w.r.t. a theory curve + +:::: +::: + +\textcolor{gray}{$\epsilon_\mathrm{B} \equiv \alpha$}: "background efficiency", i.e., prob. to misclassify bckg. as signal + +\textcolor{gray}{$\epsilon_\mathrm{S} \equiv 1 - \beta$}: "signal efficiency" + +\begin{center} +\begin{tabular}{ l l l} + & $H_0$ is true & $H_0$ is false (i.e., $H_1$ is true)\\ + \hline + $H_0$ is rejected & Type I error ($\alpha$) & Correct decision ($1 - \beta$) \\ + $H_0$ is not rejected & Correct decision ($1 - \alpha$) & Type II error ($\beta$) \\ + \hline +\end{tabular} +\end{center} + + +## Neyman-Pearson Lemma + +The likelihood ratio + +$$ t(\vec x) = \frac{f(\vec x|H_1)}{f(\vec x|H_0)} $$ + +is an optimal test statistic, i.e., it provides highest "signal efficiency" $1-\beta$ for a given "background efficiency" $\alpha$. Accept hypothesis if $t(\vec x) > c$. + +\vfill + +Problem: the underlying pdf's are almost never known explicitly. + +\vfill + +Two approaches + +1. Estimate signal and background pdf's and construct test statistic based on Neyman-Pearson lemma + +2. Decision boundaries determined directly without approximating the pdf's (linear discriminants, decision trees, neural networks, ...) + + +## Estimating PDFs from Histograms? + +\begin{center} +\includegraphics[width=0.8\textwidth]{figures/pdf_from_2d_histogram.png} +$\color{gray} \text{approximate PDF by} \; N(x,y|S) \; \text{and} \; N(x,y|B)$ +\end{center} + +$M$ bins per variable in $d$ dimensions: $M^d$ cells$\to$ hard to generate enough training data (often not practical for $d > 1$) + + +In general in machine learning, problems related to a large number of dimensions of the feature space are referred to as the \textcolor{red}{"curse of dimensionality"} + +## Na$\text{\"i}$ve Bayesian Classifier (also called "Projected Likelihood Classification") + +Application of the Neyman-Pearson lemma (ignoring correlations between the $x_i$): + +$$ f(x_1, x_2, ..., x_n) \quad \mbox{approximated as} \quad L = f_1(x_1) \cdot f_2(x_2) \cdot ... \cdot f_n(x_n) $$ +\begin{align*} +\mbox{where} \quad +f_1(x_1) & = \int \mathrm dx_2 \mathrm dx_3 ... \mathrm dx_n\; f(x_1, x_2, ..., x_n) \\ +f_2(x_2) & = \int \mathrm dx_1 \mathrm dx_3 ... 
\mathrm dx_n\; f(x_1, x_2, ..., x_n) \\ +\vdots +\end{align*} +Classification of feature vector $x$: +$$ +y(\vec x) = \frac{L_\mathrm{s}(\vec x)}{L_\mathrm{s}(\vec x) + L_\mathrm{b}(\vec x)} = \frac{1}{1 + L_\mathrm{b}(\vec x) / L_\mathrm{s}(\vec x)} +$$ + + +Performance not optimal if true PDF does not factorize + +## k-Nearest Neighbor Method (1) + +$k$-NN classifier: + +* Estimates probability density around the input vector +* $p(\vec x|S)$ and $p(\vec x|B)$ are approximated by the number of signal and background events in the training sample that lie in a small volume around the point $\vec x$ + +\vspace{2ex} + +Algorithms finds $k$ nearest neighbors: +$$ k = k_s + k_b $$ + +Probability for the event to be of signal type: + +$$ p_s(\vec x) = \frac{k_s(\vec x)}{k_s(\vec x) + k_b(\vec x)} $$ + +## k-Nearest Neighbor Method (2) + +::: columns +:::: {.column width=60%} +Simplest choice for distance measure in feature space is the Euclidean distance: +$$ R = |\vec x - \vec y|$$ + +Better: take correlations between variables into account: + +$$ R = \sqrt{(\vec{x}-\vec{y})^T \mat{V}^{-1} (\vec{x}-\vec{y})} $$ +$$ \mat{V} = \text{covariance matrix}, R = \text{"Mahalanobis distance"}$$ + + +:::: +:::: {.column width=40%} +![](figures/knn.png) +:::: +::: + +\vfill + +The $k$-NN classifier has best performance when the boundary that separates signal and background events has irregular features that cannot be easily approximated by parametric learning methods. + + +## Fisher Linear Discriminant + +Linear discriminant is simple. Can still be optimal if amount of training data is limited. + + +Ansatz for test statistic: $$ y(\vec x) = \sum_{i=1}^n w_i x_i = \vec w^\intercal \vec x $$ + + +Choose parameters $w_i$ so that separation between signal and background distribution is maximum. + +\vfill + +Need to define "separation". + + +::: columns +:::: {.column width=45%} +\begin{center} +Fisher: maximize $$ J(\vec w) = \frac{(\tau_s - \tau_b)^2}{\Sigma_s^2 + \Sigma_b^2} $$ +\end{center} +:::: +:::: {.column width=55%} +![](figures/fisher.png) +:::: +::: + +## Fisher Linear Discriminant: Determining the Coefficients $w_i$ + +::: columns +:::: {.column width=60%} +Coefficients are obtained from: $$ \frac{\partial J}{\partial w_i} = 0 $$ + +\vspace{2ex} + +Linear decision boundaries + +\vspace{5ex} + +Weight vector $\vec w$ can be interpreted as a direction in feature space onto which the events are projected. +:::: +:::: {.column width=40%} +![](figures/fisher_linear_decision_boundary.png) +:::: +::: + + + + +## Linear regression revisited + +\vfill + +::: columns +:::: {.column width=50%} +\small \textcolor{gray}{"Galton family heights data": \\ origin of the term "regression"} \normalsize +![](figures/03_ml_basics_galton_linear_regression_iminuit.pdf) + +:::: +:::: {.column width=50%} + +* data: $\{x_i,y_i\}$ \ +* objective: predict $y = f(x)$ +* model: $f(x; \vec \theta) = m x + b, \quad \vec \theta = (m, b)$ +* loss function: $J(\theta|x,y) = \frac{1}{N} \sum_{i=1}^N (y_i - f(x_i))^2$ +* model training: optimal parameters $\hat{\vec{\theta}} = \mathrm{arg\,min} \, J(\vec \theta)$ + +:::: +::: + +## Linear regression + +* Data: vectors with $p$ components ("features"): $\vec x = (x_1, ..., x_p)$ +* $n$ observations: $\{\vec x_i, y_i\}, \quad i = 1, ..., n$ +* Prediction for given vector $x$: + $$ y = w_0 + w_1 x_1 + w_2 x_2 + ... 
+ w_p x_p \equiv \vec w^\intercal \vec x \quad \text{where } x_0 := 1 $$ + +* Find weights that minimze loss function: + $$\hat{\vec{w}} = \underset{\vec w}{\min} \sum_{i=1}^{n} (\vec w^\intercal \vec x_i - y_i)^2$$ + +* In case of linear regression closed-form solution exists: + $$ \hat{\vec{w}} = (\mat{X}^\intercal \mat{X})^{-1} \mat{X}^\intercal \vec y \quad \text{where} \; X \in \mathbb{R}^{n \times p}$$ + +* $X$ is called the design matrix, row $i$ of $X$ is $\vec x_i$ + +## Linear regression with regularization + +::: columns +:::: {.column width=45%} +* Standard loss function + $$ C(\vec w) = \sum_{i=1}^{n} (\vec w^\intercal \vec x_i - y_i)^2 $$ + +* Ridge regression + $$ C(\vec w) = \sum_{i=1}^{n} (\vec w^\intercal \vec x_i - y_i)^2 + \lambda |\vec w|^2$$ + +* LASSO regression + $$ C(\vec w) = \sum_{i=1}^{n} (\vec w^\intercal \vec x_i - y_i)^2 + \lambda |\vec w| $$ + +:::: +:::: {.column width=55%} + +\vfill + +![](figures/L1vsL2.pdf) +\small \textcolor{gray}{LASSO regression tends to give sparse solutions (many components $w_j = 0$). This is why LASSO regression is also called sparse regression.} \normalsize +:::: +::: + +## Logistic regression (1) + +* Consider binary classification task, e.g., $y_i \in \{0,1\}$ +* Objective: Predict probability for outcome $y=1$ given an observation $\vec x$ +* Starting with linear "score" + $$ s = w_0 + w_1 x_1 + w_2 x_2 + ... + w_p x_p \equiv \vec w^\intercal \vec x$$ +* Define function that translates $s$ into a quantity that has the properties of a probability + $$ \sigma(s) = \frac{1}{1+e^{-s}} $$ +* We would like to determine the optimal weights for a given training data set. They result from the maximum-likelihood principle. + +## Logistic regression (2) + +* Consider feature vector $\vec x$. For a given set of weights $\vec w$ the model predicts + * a probability $p(1|\vec w) = \sigma(\vec w^\intercal \vec x)$ for outcome $y=1$ + * a probabiltiy $p(0|\vec w) = 1 - \sigma(\vec w^\intercal \vec x)$ for outcome $y=0$ +* The probability $p(y_i | \vec w)$ defines the likelihood $L_i(\vec w) = p(y_i | \vec w)$ (the likelihood is a function of the parameters $\vec w$ and the observations $y_i$ are fixed). 
+* Likelihood for the full data sample ($n$ observations) + $$ L(\vec w) = \prod_{i=1}^n L_i(\vec w) = \prod_{i=1}^n \sigma(\vec w^\intercal \vec x)^{y_i} \,(1-\sigma(\vec w^\intercal \vec x))^{1-y_i} $$ +* Maximizing the log-likelihood $\ln L(\vec w)$ corresponds to minimizing the loss function + $$ C(\vec w) = - \ln L(\vec w) = \sum_{i=1}^n - y_i \ln \sigma(\vec w^\intercal \vec x) - +(1-y_i) \ln(1-\sigma(\vec w^\intercal \vec x))$$ +* This is nothing else but the cross-entropy loss function + +## scikit-learn + +::: columns +:::: {.column width=70%} +* Free software machine learning library for Python +* Initial release: 2007 +* features various classification, regression and clustering algorithms including k-nearest neighbors, multi-layer perceptrons, support vector machines, random forests, gradient boosting, k-means +* Scikit-learn is one of the most popular machine learning libraries on GitHub +* [https://scikit-learn.org/](https://scikit-learn.org/) +:::: +:::: {.column width=30%} +\vspace{7ex} +\begin{figure} +\centering +\includegraphics[width=0.85\textwidth]{figures/scikit-learn.png} +\end{figure} +:::: +::: + + +## Example 1 - Probability of passing an exam (logistic regression) (1) + +Objective: predict the probability that someone passes an exam based on the number of hours studying + +$$ p_\mathrm{pass} = \sigma(s) = \frac{1}{1+e^{-s}}, \quad s = w_1 t + w_0, \quad t = \text{\# hours}$$ + +::: columns +:::: {.column width=40%} +* Data set: \ + * preparation $t$ time in hours + * passed / not passes (0/1) +* Parameters need to be determined through numerical minimization + * $w_0 = -4.0777$ + * $w_1 = 1.5046$ + + +\vspace{1.5ex} +\footnotesize +[\textcolor{gray}{03\_ml\_basics\_logistic\_regression.ipynb}](https://nbviewer.jupyter.org/urls/www.physi.uni-heidelberg.de/~reygers/lectures/2022/ml/examples/03_ml_basics_logistic_regression.ipynb) +\normalsize +:::: +:::: {.column width=60%} +![](figures/03_ml_basics_logistic_regression.pdf){width=90%} +:::: +::: + +## Example 1 - Probability of passing an exam (logistic regression) (2) + +\footnotesize +\textcolor{gray}{Read data from file:} +```python +# data: 1. hours studies, 2. passed (0/1) +df = pd.read_csv(filename, engine='python', sep='\s+') +x_tmp = df['hours_studied'].values +x = np.reshape(x_tmp, (-1, 1)) +y = df['passed'].values +``` +\vfill +\textcolor{gray}{Fit the data:} +```python +from sklearn.linear_model import LogisticRegression +clf = LogisticRegression(penalty='none', fit_intercept=True) +clf.fit(x, y); +``` +\vfill +\textcolor{gray}{Calculate predictions:} +```python +hours_studied_tmp = np.linspace(0., 6., 1000) +hours_studied = np.reshape(hours_studied_tmp, (-1, 1)) +y_pred = clf.predict_proba(hours_studied) +``` +\normalsize + +## Precision and recall + +::: columns +:::: {.column width=50%} +\textcolor{blue}{Precision:}\ +Fraction of correctly classified instances among all instances that obtain a certain class label. + +$$ \text{precision} = \frac{\text{TP}}{\text{TP} + \text{FP}} $$ + +\begin{center} +\textcolor{gray}{"purity"} +\end{center} + +:::: +:::: {.column width=50%} +\textcolor{blue}{Recall:}\ +Fraction of positive instances that are correctly classified. 
+\vspace{2.9ex} + +$$ \text{recall} = \frac{\text{TP}}{\text{TP} + \text{FN}} $$ + +\begin{center} +\textcolor{gray}{"efficiency"} +\end{center} + +:::: +::: +\vfill +\begin{center} +\textcolor{gray}{TP: true positives, FP: false positives, FN: false negatives} +\end{center} + +## Example 2: Heart disease data set (logistic regression) (1) + +\scriptsize +\textcolor{gray}{Read data:} +```python +filename = "https://www.physi.uni-heidelberg.de/~reygers/lectures/2022/ml/data/heart.csv" +df = pd.read_csv(filename) +df +``` +\vfill +![](figures/heart_table.png){width=70%} +\normalsize +\vspace{1.5ex} +\footnotesize +[\textcolor{gray}{03\_ml\_basics\_log\_regr\_heart\_disease.ipynb}](https://nbviewer.jupyter.org/urls/www.physi.uni-heidelberg.de/~reygers/lectures/2022/ml/examples/03_ml_basics_log_regr_heart_disease.ipynb) +\normalsize + +## Example 2: Heart disease data set (logistic regression) (2) +\footnotesize + +\textcolor{gray}{Define array of labels and feature vectors} +```python +y = df['target'].values +X = df[[col for col in df.columns if col!="target"]] +``` +\vfill +\textcolor{gray}{Generate training and test data sets} +```python +from sklearn.model_selection import train_test_split +X_train, X_test, y_train, y_test + = train_test_split(X, y, test_size=0.5, shuffle=True) +``` +\vfill +\textcolor{gray}{Fit the model} +```python +from sklearn.linear_model import LogisticRegression +lr = LogisticRegression(penalty='none', + fit_intercept=True, max_iter=1000, tol=1E-5) +lr.fit(X_train, y_train) +``` +\normalsize + +## Example 2: Heart disease data set (logistic regression) (3) +\footnotesize +\textcolor{gray}{Test predictions on test data set:} +```python +from sklearn.metrics import classification_report +y_pred_lr = lr.predict(X_test) +print(classification_report(y_test, y_pred_lr)) +``` +\vfill +\textcolor{gray}{Output:} +``` + precision recall f1-score support + + 0 0.75 0.86 0.80 63 + 1 0.89 0.80 0.84 89 + + accuracy 0.82 152 + macro avg 0.82 0.83 0.82 152 +weighted avg 0.83 0.82 0.82 152 +``` + +## Example 2: Heart disease data set (logistic regression) (4) + +\textcolor{gray}{Compare to another classifier using the \textit{receiver operating characteristic} (ROC) curve} +\vfill +\textcolor{gray}{Let's take the random forest classifier} +\footnotesize +```python +from sklearn.ensemble import RandomForestClassifier +rf = RandomForestClassifier(max_depth=3) +rf.fit(X_train, y_train) +``` +\normalsize +\vfill +\textcolor{gray}{Use \texttt{roc\_curve} from scikit-learn} +\footnotesize +```python +from sklearn.metrics import roc_curve + +y_pred_prob_lr = lr.predict_proba(X_test) # predicted probabilities +fpr_lr, tpr_lr, _ = roc_curve(y_test, y_pred_prob_lr[:,1]) + +y_pred_prob_rf = rf.predict_proba(X_test) # predicted probabilities +fpr_rf, tpr_rf, _ = roc_curve(y_test, y_pred_prob_rf[:,1]) + +``` +\normalsize + +## Example 2: Heart disease data set (logistic regression) (5) +::: columns +:::: {.column width=50%} +\scriptsize +```python +plt.plot(tpr_lr, 1-fpr_lr, label="log. 
regression") +plt.plot(tpr_rf, 1-fpr_rf, label="random forest") +``` +\vspace{5ex} + +\normalsize +\textcolor{gray}{Classifiers can be compared with the \textit{area under curve} (AUC) score.} +\scriptsize +```python +from sklearn.metrics import roc_auc_score +auc_lr = roc_auc_score(y_test,y_pred_lr) +auc_rf = roc_auc_score(y_test,y_pred_rf) +print(f"AUC scores: {auc_lr:.2f}, {auc_knn:.2f}") +``` +\vspace{5ex} +\normalsize +\textcolor{gray}{This gives} +\scriptsize +``` +AUC scores: 0.82, 0.83 +``` +\normalsize + +:::: +:::: {.column width=50%} +\begin{figure} +\centering +\includegraphics[width=0.96\textwidth]{figures/03_ml_basics_log_regr_heart_disease.pdf} +\end{figure} +:::: +::: + +## Multinomial logistic regression: Softmax function + +In the previous example we considered two classes (0, 1). For multi-class classification, the logistic function can generalized to the softmax function. +\vfill +Now consider $k$ classes and let $s_i$ be the score for class $i$: $\vec s = (s_1, ..., s_k)$ +\vfill +A probability for class $i$ can be predicted with the softmax function: + $$ \sigma(\vec s)_i = \frac{e^{s_i}}{\sum_{j=1}^k e^{s_j}} \quad \text{ for } \quad i = 1, ... , k $$ +The softmax functions is often used as the last activation function of a neural network in order to predict probabilities in a classification task. +\vfill +Multinomial logistic regression is also known as softmax regression. + +## Example 3: Iris data set (softmax regression) (1) + +Iris flower data set + +* Introduced 1936 in a paper by Ronald Fisher +* Task: classify flowers +* Three species: iris setosa, iris virginica and iris versicolor +* Four features: petal width and length, sepal width/length, in centimeters + +::: columns +:::: {.column width=40%} +\begin{figure} +\centering +\includegraphics[width=0.95\textwidth]{figures/iris_dataset.png} +\end{figure} +:::: +:::: {.column width=60%} + +\vspace{2ex} + +\footnotesize +[\textcolor{gray}{03\_ml\_basics\_iris\_softmax\_regression.ipynb}](https://nbviewer.jupyter.org/urls/www.physi.uni-heidelberg.de/~reygers/lectures/2022/ml/examples/03_ml_basics_iris_softmax_regression.ipynb) + +\vspace{19ex} + +\scriptsize +[https://archive.ics.uci.edu/ml/datasets/Iris](https://archive.ics.uci.edu/ml/datasets/Iris) + +[https://en.wikipedia.org/wiki/Iris_flower_data_set](https://en.wikipedia.org/wiki/Iris_flower_data_set) +\normalsize +:::: +::: + +## Example 3: Iris data set (softmax regression) (2) + +\textcolor{gray}{Get data set} +\footnotesize +```python +# import some data to play with +# columns: Sepal Length, Sepal Width, Petal Length and Petal Width +iris = datasets.load_iris() +X = iris.data +y = iris.target + +# split data into training and test data sets +x_train, x_test, y_train, y_test = + train_test_split(X, y, test_size=0.5, random_state=42) +``` +\normalsize +\vfill + +\textcolor{gray}{Softmax regression} +\footnotesize +```python +from sklearn.linear_model import LogisticRegression +log_reg = LogisticRegression(multi_class='multinomial', penalty='none') +log_reg.fit(x_train, y_train); +``` +\normalsize + +## Example 3 : Iris data set (softmax regression) (3) + +::: columns +:::: {.column width=70%} +\textcolor{gray}{Accuracy and confusion matrix for different classifiers} +\footnotesize +```python +for clf in [log_reg, kn_neigh, fisher_ld]: + y_pred = clf.predict(x_test) + acc = accuracy_score(y_test, y_pred) + print(type(clf).__name__) + print(f"accuracy: {acc:0.2f}") + + # confusion matrix: + # columns: true class, row: predicted class + 
print(confusion_matrix(y_test, y_pred),"\n") +``` +\normalsize +:::: +:::: {.column width=30%} + +\footnotesize +``` +LogisticRegression +accuracy: 0.96 +[[29 0 0] + [ 0 23 0] + [ 0 3 20]] + +KNeighborsClassifier +accuracy: 0.95 +[[29 0 0] + [ 0 23 0] + [ 0 4 19]] + +LinearDiscriminantAnalysis +accuracy: 0.99 +[[29 0 0] + [ 0 23 0] + [ 0 1 22]] +``` +\normalsize +:::: +::: + +## General remarks on multi-variate analyses (MVAs) + +* MVA Methods + * More effective than classic cut-based analyses + * Take correlations of input variables into account +\vfill +* Important: find good input variables for MVA methods + * Good separation power between S and B + * No strong correlation among variables + * No correlation with the parameters you try to measure in your signal sample! +\vfill +* Pre-processing + * Apply obvious variable transformations and let MVA method do the rest + * Make use of obvious symmetries: if e.g. a particle production process is symmetric in polar angle $\theta$ use $|\cos \theta|$ and not $\cos \theta$ as input variable + * It is generally useful to bring all input variables to a similar numerical range + +## Example of feature transformation + +\begin{figure} +\centering +\includegraphics[width=0.95\textwidth]{figures/feature_transformation.png} +\end{figure} + +## Exercise 1: Classification of air showers measured with the MAGIC telescope + +::: columns +:::: {.column width=50%} + +\small +* Cosmic gamma rays (30 GeV - 30 TeV). +* Cherenkov light from air showers +* Background: air showers caused by hadrons. +\normalsize + +\begin{figure} +\centering +\includegraphics[width=0.85\textwidth]{figures/magic_photo_small.png} +\end{figure} +:::: +:::: {.column width=50%} +![](figures/magic_sketch.png) +:::: +::: + +## Exercise 1: Classification of air showers measured with the MAGIC telescope +\begin{figure} +\centering +\includegraphics[width=0.75\textwidth]{figures/magic_shower_em_had_small.png} +\end{figure} +::: columns +:::: {.column width=50%} +\begin{center} +Gamma shower +\end{center} +:::: +:::: {.column width=50%} +\begin{center} +Hadronic shower +\end{center} +:::: +::: + +## Exercise 1: Classification of air showers measured with the MAGIC telescope +\begin{figure} +\centering +\includegraphics[width=0.95\textwidth]{figures/magic_shower_parameters.png} +\end{figure} + +## Exercise 1: Classification of air showers measured with the MAGIC telescope +MAGIC data set \ +\tiny +[\textcolor{gray}{https://archive.ics.uci.edu/ml/datasets/magic+gamma+telescope}](https://archive.ics.uci.edu/ml/datasets/magic+gamma+telescope) +\normalsize + +\scriptsize +``` +1. fLength: continuous # major axis of ellipse [mm] +2. fWidth: continuous # minor axis of ellipse [mm] +3. fSize: continuous # 10-log of sum of content of all pixels [in #phot] +4. fConc: continuous # ratio of sum of two highest pixels over fSize [ratio] +5. fConc1: continuous # ratio of highest pixel over fSize [ratio] +6. fAsym: continuous # dist. from highest pixel to center, proj. onto major axis [mm] +7. fM3Long: continuous # 3rd root of third moment along major axis [mm] +8. fM3Trans: continuous # 3rd root of third moment along minor axis [mm] +9. fAlpha: continuous # angle of major axis with vector to origin [deg] +10. fDist: continuous # distance from origin to center of ellipse [mm] +11. class: g,h # gamma (signal), hadron (background) + +g = gamma (signal): 12332 +h = hadron (background): 6688 + +For technical reasons, the number of h events is underestimated. 
+In the real data, the h class represents the majority of the events. +``` +\normalsize + +## Exercise 1: Classification of air showers measured with the MAGIC telescope + +\small +[\textcolor{gray}{03\_ml\_basics\_ex\_1\_magic.ipynb}](https://nbviewer.jupyter.org/urls/www.physi.uni-heidelberg.de/~reygers/lectures/2022/ml/exercises/03_ml_basics_ex_1_magic.ipynb) +\normalsize + +a) Create for each variable a figure with a plot for gammas and hadrons overlayed. +b) Create training and test data set. The test data should amount to 50% of the total data set. +c) Define the logistic regressor and fit the training data +d) Determine the model accuracy and the AUC score +e) Plot the ROC curve (background rejection vs signal efficiency) + +## Exercise 2: Hand-written digit recognition with logistic regression + +\small +[\textcolor{gray}{03\_ml\_basics\_ex\_2\_mnist\_softmax\_regression.ipynb}](https://nbviewer.jupyter.org/urls/www.physi.uni-heidelberg.de/~reygers/lectures/2022/ml/exercises/03_ml_basics_ex_2_mnist_softmax_regression.ipynb) +\normalsize + +a) Define logistic regressor from scikit-learn and fit data +b) Use \texttt{classification\_report} from scikit-learn to determine precision and recall +c) Read in a hand-written digit and classify it. Print the probabilities for each digit. Determine the digit with the highest probability. +d) (Optional) Create you own hand-written digit with a program like gimp and check what the classifier does + +\begin{figure} +\centering +\includegraphics[width=0.85\textwidth]{figures/handwritten_digits.png} +\end{figure} + +Hint: You can install required packages on the jupyter hub server like so: +\scriptsize +``` +!pip3 install --user pypng +``` +\normalsize + + +## Exercise 3: Data preprocessing + +a) Read the description of the [`sklearn.preprocessing`](https://scikit-learn.org/stable/modules/preprocessing.html) package. + +b) Start from the example notebook on the logistic regression for the heart disease data set ([03_ml_basics_log_regr_heart_disease.ipynb](https://nbviewer.jupyter.org/urls/www.physi.uni-heidelberg.de/~reygers/lectures/2022/ml/examples/03_ml_basics_log_regr_heart_disease.ipynb)). Pre-process the heart disease data set according to the given example. Does preprocessing make a difference in this case? + diff --git a/slides/neural_networks.md b/slides/neural_networks.md new file mode 100644 index 0000000..1ffcfc5 --- /dev/null +++ b/slides/neural_networks.md @@ -0,0 +1,808 @@ +--- +title: | + | Introduction to Data Analysis and Machine Learning in Physics: + | 5. 
Neural networks + +author: "Martino Borsato, Jörg Marks, Klaus Reygers" +date: "Studierendentage, 11-14 April 2022" +--- + +## Exercises + +* Exercise 1: Learn XOR with a MLP + * [`05_neural_networks_ex_1_xor.ipynb`](https://nbviewer.jupyter.org/urls/www.physi.uni-heidelberg.de/~reygers/lectures/2022/ml/exercises/05_neural_networks_ex_1_xor.ipynb) +* Exercise 2: Visualising decision boundaries of classifiers + * [`05_neural_networks_ex_2_decision_boundaries.ipynb`](https://nbviewer.jupyter.org/urls/www.physi.uni-heidelberg.de/~reygers/lectures/2022/ml/exercises/05_neural_networks_ex_2_decision_boundaries.ipynb) +* Exercise 3: Boston house prices (MLP regression) + * [`05_neural_networks_ex_3_boston_house_prices.ipynb`](https://nbviewer.jupyter.org/urls/www.physi.uni-heidelberg.de/~reygers/lectures/2022/ml/exercises/05_neural_networks_ex_3_boston_house_prices.ipynb) +* Exercise 4: Training a digit-classification neural network on the MNIST dataset using Keras + * [`05_neural_networks_ex_4_mnist_keras_train.ipynb`](https://nbviewer.jupyter.org/urls/www.physi.uni-heidelberg.de/~reygers/lectures/2022/ml/exercises/05_neural_networks_ex_4_mnist_keras_train.ipynb) + + +## Perceptron (1) + +::: columns +:::: {.column width=65%} +\begin{center} +\includegraphics[width=0.40\textwidth]{figures/perceptron_weighted_sum.png} +\vspace{1ex} +\includegraphics[width=0.75\textwidth]{figures/perceptron_retina.png} +\end{center} +:::: +:::: {.column width=35%} +$$h(\vec x) = \begin{cases}1 & \text{if }\ \vec w \cdot \vec x + b > 0,\\0 & \text{otherwise}\end{cases}$$ +\begin{center} +\includegraphics[width=0.95\textwidth]{figures/perceptron_photo.png} +\tiny +\textcolor{gray}{Mark 1 Perceptron. Frank Rosenblatt (1961)} +\normalsize +\end{center} +:::: +::: +\footnotesize +\vspace{2ex} +\textcolor{gray}{The perceptron was designed for image recognition. It was first implemented in hardware (400 photocells, weights = potentiometer settings).} +\normalsize + +## Perceptron (2) +::: columns +:::: {.column width=60%} +* McCulloch–Pitts (MCP) neuron (1943) + * First mathematical model of a biological neuron + * Boolean input + * Equal weights for all inputs + * Threshold hardcoded +* Improvements by Rosenblatt + * Different weights for inputs + * Algorithm to update weights and threshold given labeled training data + +\vfill + +Shortcoming of the perceptron: \newline +it cannot learn the XOR function \newline +\tiny \textcolor{gray}{Minsky, Papert, 1969} \normalsize + +:::: +:::: {.column width=40%} +![](figures/perceptron_with_threshold.png){width=80%} +![](figures/xor.png) +\small \textcolor{gray}{XOR: not linearly separable } \normalsize + +:::: +::: + +## The biological inspiration: the neuron + +\begin{figure} +\centering +\includegraphics[width=0.95\textwidth]{figures/neuron.png} +\end{figure} + +## Non-linear transfer / activation function + +Discriminant: $$ y(\vec x) = h\left( w_0 + \sum_{i=1}^n w_i x_i \right) $$ + +Examples for function $h$: \newline +$$ \frac{1}{1+e^{-x}} \; \text{("sigmoid" or "logistic" function)}, \quad \tanh x $$ + +::: columns +:::: {.column width=50%} +\begin{figure} +\centering +\includegraphics[width=0.75\textwidth]{figures/logistic_fct.png} +\end{figure} +:::: +:::: {.column width=50%} +\vspace{3ex} +Non-linear activation function needed in neural networks when feature space is not linearly separable. 
+\newline + +\small +\textcolor{gray}{Neural net with linear activation functions is just a perceptron} +\normalsize +:::: +::: + +## Feedforward neural network with one hidden layer +::: columns +:::: {.column width=60%} +![](figures/mlp.png){width=80%} +:::: +:::: {.column width=40%} +$$ \phi_i(\vec x) = h\left(w_{i0}^{(1)} + \sum_{j=1}^n w_{ij}^{(1)} x_j\right) $$ +\vfill +$$ y(\vec x) = h\left( w_{10}^{(2)} + \sum_{j=1}^m w_{1j}^{(2)} \phi_j(\vec x)\right) $$ +\vfill +\vspace{2ex} +\footnotesize +\textcolor{gray}{superscripts indicates layer number, i.e., $w_{ij}^{(1)}$ refers to the input weights of neuron $i$ in the hidden layer (= layer 1).} +\normalsize + +:::: +::: +\begin{center} +Straightforward to generalize to multiple hidden layers +\end{center} + +## Neural network output and decision boundaries +::: columns +:::: {.column width=75%} +\begin{figure} +\centering +\includegraphics[width=\textwidth]{figures/nn_decision_boundary.png} +\end{figure} +:::: +:::: {.column width=25%} +\vspace{3ex} +\footnotesize +\textcolor{gray}{P. Bhat, Multivariate Analysis Methods in Particle Physics, inspirehep.net/record/879273} +\normalsize +:::: +::: + +## Fun with neural nets in the browser +\begin{figure} +\centering +\includegraphics[width=\textwidth]{figures/tf_playground.png} +\end{figure} +\tiny +[\textcolor{gray}{http://playground.tensorflow.org}](http://playground.tensorflow.org) +\normalsize + +## Backpropagation (1) +Start with an initial guess $\vec w_0$ for the weights an then update weights after each training event: +$$ \vec w^{(\tau+1)} = \vec w^{(\tau)} - \eta \nabla E_a(\vec w^{(\tau)}), \quad \eta = \text{learning rate}$$ + +Gradient descent: +\begin{figure} +\centering +\includegraphics[width=0.46\textwidth]{figures/gradient_descent.png} +\end{figure} + +## Backpropagation (2) +::: columns +:::: {.column width=40%} +\vspace{6ex} +![](figures/mlp.png){width=100%} +:::: +:::: {.column width=60%} +Let's write network output as follows: +\begin{align*} +y(\vec x) &= h(u(\vec x)); \quad u(\vec x) = \sum_{j=0}^m w_{1j}^{(2)} \phi_j(\vec x) \\ +\phi_j(\vec x) &= h\left( \sum_{k=0}^n w_{jk}^{(1)} x_k\right) +\equiv h\left( v_j(\vec x) \right) +\end{align*} + +For $E_a = \frac{1}{2} (y_a - t_a)^2$ one obtains for the weights from hidden layer to output: +\begin{align*} +\frac{\partial E_a}{\partial w_{1j}^{(2)}} &= (y_a -t_a) h'(u(\vec x_a)) \frac{\partial u}{\partial w_{1j}^{(2)}} \\ +&= (y_a -t_a) h'(u(\vec x_a)) \phi_j(\vec x_a) +\end{align*} +:::: +::: +\vspace{2ex} +Further application of the chain rule gives weights from input to hidden layer. 
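+
+## Backpropagation: a minimal numpy sketch
+
+As an illustration (a sketch added here, not an excerpt from the lecture notebooks), the update rules above translate directly into a few lines of numpy -- assuming sigmoid activations for both layers, the squared-error loss $E_a = \frac{1}{2}(y_a - t_a)^2$, a single training event, and bias terms omitted:
+
+\footnotesize
+```python
+import numpy as np
+
+def sigmoid(z):
+    return 1.0 / (1.0 + np.exp(-z))
+
+x, t, eta = np.array([0.5, -1.2, 0.3]), 1.0, 0.1   # one event, target, learning rate
+rng = np.random.default_rng(1)
+W1 = rng.normal(size=(4, x.size))   # hidden-layer weights w^(1) (4 hidden neurons)
+W2 = rng.normal(size=4)             # output weights w^(2)
+
+# forward pass
+phi = sigmoid(W1 @ x)               # hidden activations phi_j
+u = W2 @ phi
+y = sigmoid(u)                      # network output
+
+# backward pass (chain rule)
+delta_out = (y - t) * y * (1 - y)             # dE/du, using h'(u) = y(1-y)
+grad_W2 = delta_out * phi                     # dE/dw^(2): (y-t) h'(u) phi_j
+delta_hid = delta_out * W2 * phi * (1 - phi)  # dE/dv_j
+grad_W1 = np.outer(delta_hid, x)              # dE/dw^(1)
+
+# gradient-descent update
+W2 -= eta * grad_W2
+W1 -= eta * grad_W1
+```
+\normalsize
+
+In practice one lets a framework such as Keras or TensorFlow do this bookkeeping (see below); spelling it out once shows that backpropagation is nothing more than the chain rule combined with gradient descent.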
+
+## Backpropagation (3)
+Backpropagation summary
+
+* Make a prediction for a given training instance (forward pass)
+* Calculate the error (value of the loss function)
+* Go backwards and determine the contribution of each weight (reverse pass)
+* Adjust the weights to reduce the error
+
+\vfill
+
+Practical considerations:
+
+* Nowadays, people implement neural networks with frameworks like Keras or TensorFlow
+* No need to implement backpropagation yourself
+* TensorFlow efficiently calculates the gradients for you (automatic differentiation)
+
+
+## More on gradient descent
+
+::: columns
+:::: {.column width=60%}
+* Stochastic gradient descent
+    * just uses one training event at a time
+    * fast, but quite irregular approach to the minimum
+    * can help escape local minima
+    * one can decrease the learning rate over time to settle at the minimum ("simulated annealing")
+* Batch gradient descent
+    * uses the entire training sample to calculate the gradient of the loss function
+    * computationally expensive
+* Mini-batch gradient descent
+    * calculates the gradient for a random sub-sample of the training set
+
+::::
+:::: {.column width=40%}
+\begin{figure}
+\centering
+\includegraphics[width=0.7\textwidth]{figures/stochastic_gradient_descent.png}
+\end{figure}
+\begin{figure}
+\centering
+\includegraphics[width=\textwidth]{figures/gradient_descent_cmp.png}
+\end{figure}
+::::
+:::
+
+## Universal approximation theorem
+
+::: columns
+:::: {.column width=60%}
+"A feed-forward network with a single hidden layer containing a finite number of neurons (i.e., a multilayer perceptron) can approximate continuous functions on compact subsets of $\mathbb{R}^n$."
+
+\vspace{5ex}
+
+One of the first versions of the theorem was proved by George Cybenko in 1989 for sigmoid activation functions.
+
+\vspace{5ex}
+
+The theorem does not touch upon the algorithmic learnability of those parameters.
+
+::::
+:::: {.column width=40%}
+\begin{figure}
+\centering
+\includegraphics[width=\textwidth]{figures/ann.png}
+\end{figure}
+::::
+:::
+
+## Deep neural networks
+Deep networks: many hidden layers with a large number of neurons
+
+::: columns
+:::: {.column width=45%}
+* Challenges
+    * Hard to train ("vanishing gradient problem")
+    * Training is slow
+    * Risk of overtraining
+::::
+:::: {.column width=55%}
+* Big progress in recent years
+    * Interest in NN waned before ca. 2006
+    * Milestone: paper by G. 
Hinton (2006): "learning for deep belief nets" + * Image recognition, AlphaGo, … + * Soon: self-driving cars, … +:::: +::: +\begin{figure} +\centering +\includegraphics[width=0.5\textwidth]{figures/dnn.png} +\end{figure} + +## Drawbacks of the sigmoid activation function + +::: columns +:::: {.column width=50%} +\includegraphics[width=.75\textwidth]{figures/sigmoid.png} +:::: +:::: {.column width=50%} +$$ \sigma(x) = \frac{1}{1 + e^{-x}} $$ +\vspace{3ex} + +* Saturated neurons “kill” the gradients +* Sigmoid outputs are not zero-centered +* exp() is a bit compute expensive +:::: +::: + +## Activation functions +\begin{figure} +\centering +\includegraphics[width=\textwidth]{figures/activation_functions.png} +\end{figure} + +## ReLU +::: columns +:::: {.column width=50%} +\includegraphics[width=.75\textwidth]{figures/relu.png} +:::: +:::: {.column width=50%} +$$ f(x) = \max(0,x) $$ +\vspace{1ex} + +* Does not saturate (in +region) +* Very computationally efficient +* Converges much faster than sigmoid tanh in practice +* Actually more biologically plausible than sigmoid +* But: gradient vanishes for $x < 0$ + +:::: +::: + + +## Bias-variance tradeoff (1) + +Goal: generalization of training data + +* Simple models (few parameters): danger of bias + * \textcolor{gray}{Classifiers with a small number of degrees of freedom are less prone to statistical fluctuations: different training samples would result in similar classification boundaries ("small variance")} +* Complex models (many parameters): danger of overfitting + * \textcolor{gray}{large variance of decision boundaries for different training samples} + +## Bias-variance tradeoff (2) +\begin{figure} +\centering +\includegraphics[trim=4cm 0cm 4cm 0cm, width=\textwidth]{figures/underfitting_overfitting.pdf} +\end{figure} + +## Example of overtraining +Too many neurons/layers make a neural network too flexible \newline $\to$ overtraining + +\begin{figure} +\centering +\includegraphics[width=0.9\textwidth]{figures/example_overtraining.png} +\end{figure} + +## Monitoring overtraining +Monitor fraction of misclassified events (or loss function:) +\begin{figure} +\centering +\includegraphics[width=0.8\textwidth]{figures/monitoring_overtraining.png} +\end{figure} + +## Regularization: Avoid overfitting +\scriptsize +[\hfill \textcolor{gray}{http://cs231n.stanford.edu/slides}](http://cs231n.stanford.edu/slides) +\normalsize +\begin{figure} +\centering +\includegraphics[width=0.75\textwidth]{figures/regularization.png} +\end{figure} +\begin{center} +$L_1$ regularization: $R(W) = \sum_k |W_k|$, $L_2$ regularization: $R(W) = \sum_k W_k^2$ +\end{center} + +## Another approach to prevent overfitting: Dropout +* Randomly remove nodes during training +* Avoid co-adaptation of nodes +\begin{figure} +\centering +\includegraphics[width=0.8\textwidth]{figures/dropout.png} +\end{figure} +\scriptsize +\textcolor{gray}{Srivastava et al.,} +[\textcolor{gray}{"Dropout: A Simple Way to Prevent Neural Networks from Overfitting"}](jmlr.org/papers/volume15/srivastava14a.old/srivastava14a.pdf) +\normalsize + + + +## Pros and cons of multi-layer perceptrons + +\textcolor{green}{Pros} + +* Capability to learn non-linear models + +\vspace{3ex} + +\textcolor{red}{Cons} + +* Loss function can have several local minima +* Hyperparameters need to be tuned + * \textcolor{gray}{number of layers, neurons per layer, and training iterations} +* Sensitive to feature scaling + * \textcolor{gray}{preprocessing needed (e.g., scaling of all feature to range [0,1])} + + +## Example 1: 
Boston house prices (MLP regression) (1)
+* Objective: predict house prices in Boston suburbs in the mid-1970s
+* Boston house data set: 506 instances, 13 features
+
+\footnotesize
+```
+ - CRIM     per capita crime rate by town
+ - ZN       proportion of residential land zoned for lots over 25,000 sq.ft.
+ - INDUS    proportion of non-retail business acres per town
+ - CHAS     Charles River dummy variable (= 1 if tract bounds river; 0 otherwise)
+ - NOX      nitric oxides concentration (parts per 10 million)
+ - RM       average number of rooms per dwelling
+ - AGE      proportion of owner-occupied units built prior to 1940
+ - DIS      weighted distances to five Boston employment centres
+ - RAD      index of accessibility to radial highways
+ - TAX      full-value property-tax rate per $10,000
+ - PTRATIO  pupil-teacher ratio by town
+ - B        1000(Bk - 0.63)^2 where Bk is the proportion of blacks by town
+ - LSTAT    % lower status of the population
+ - MEDV     Median value of owner-occupied homes in $1000's (the regression target)
+```
+
+\footnotesize
+[\textcolor{gray}{05\_neural\_networks\_boston\_house\_prices.ipynb}](https://nbviewer.jupyter.org/urls/www.physi.uni-heidelberg.de/~reygers/lectures/2022/ml/examples/05_neural_networks_boston_house_prices.ipynb)
+
+## Example 1: Boston house prices (MLP regression) (2)
+```python
+import numpy as np
+from sklearn import datasets
+from sklearn.model_selection import train_test_split
+from sklearn.neural_network import MLPRegressor
+from sklearn.metrics import mean_squared_error
+
+boston = datasets.load_boston()
+X = boston.data
+y = boston.target
+
+# hold out part of the data for testing (sklearn default: 25%)
+X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1)
+
+mlp = MLPRegressor(hidden_layer_sizes=(100,),
+                   activation='logistic', random_state=1, max_iter=5000)
+mlp.fit(X_train, y_train)
+
+y_pred_mlp = mlp.predict(X_test)
+
+rms = np.sqrt(mean_squared_error(y_test, y_pred_mlp))
+print(f"root mean square error {rms:.2f}")
+```
+
+## Example 1: Boston house prices (MLP regression) (3)
+\begin{center}
+\includegraphics[width=0.7\textwidth]{figures/boston_house_prices.pdf}
+\end{center}
+
+## Exercise 1: XOR
+\small
+[\textcolor{gray}{05\_neural\_networks\_ex\_1\_xor.ipynb}](https://nbviewer.jupyter.org/urls/www.physi.uni-heidelberg.de/~reygers/lectures/2022/ml/exercises/05_neural_networks_ex_1_xor.ipynb)
+\normalsize
+
+::: columns
+:::: {.column width=60%}
+a) Define a multi-layer perceptron classifier that learns the XOR problem.
+\scriptsize
+```python
+from sklearn.neural_network import MLPClassifier
+
+X = [[0, 0], [0, 1], [1, 0], [1, 1]]
+y = [0, 1, 1, 0]
+```
+\normalsize
+b) Define a multi-layer perceptron regressor that fits the depicted 2d data (see notebook).
+
+c) Plot the mean square error vs. the number of training epochs for b).
+::::
+:::: {.column width=40%}
+\vspace{10ex}
+![](figures/xor_like_data.pdf)
+::::
+:::
+
+## Exercise 2: Visualising decision boundaries of classifiers
+
+\small
+[\textcolor{gray}{05\_neural\_networks\_ex\_2\_decision\_boundaries.ipynb}](https://nbviewer.jupyter.org/urls/www.physi.uni-heidelberg.de/~reygers/lectures/2022/ml/exercises/05_neural_networks_ex_2_decision_boundaries.ipynb)
+\normalsize
+
+\vspace{5ex}
+
+Visualize the decision boundaries of a scikit-learn decision tree, a scikit-learn multi-layer perceptron, and XGBoost for different toy data sets.
+
+
+## Exercise 3: Boston house prices (hyperparameter optimization)
+
+\small
+[\textcolor{gray}{05\_neural\_networks\_ex\_3\_boston\_house\_prices.ipynb}](https://nbviewer.jupyter.org/urls/www.physi.uni-heidelberg.de/~reygers/lectures/2022/ml/exercises/05_neural_networks_ex_3_boston_house_prices.ipynb)
+\normalsize
+
+\vspace{5ex}
+
+a) Can you find better hyperparameters (number of hidden layers, neurons per layer, loss function, ...)? Try this first by hand. 
+b) Now use [\textcolor{gray}{sklearn.model\_selection.GridSearchCV}](https://scikit-learn.org/stable/modules/generated/sklearn.model_selection.GridSearchCV.html) to find optimal parameters. + +## TensorFlow + +::: columns +:::: {.column width=70%} + +* Powerful open source library with a focus on deep neural networks +* Performs computations of data flow graphs +* Takes care of computing gradients of the defined functions (\textit{automatic differentiation}) +* Computations in parallel on multiple CPUs or GPUs +* Developed by the Google Brain team +* Initial release in 2015 +* [https://www.tensorflow.org/](https://www.tensorflow.org/) + +:::: +:::: {.column width=30%} +\begin{center} +\includegraphics[width=0.7\textwidth]{figures/tensorflow.png} +\end{center} +:::: +::: + +## Keras + +::: columns +:::: {.column width=70%} + +* Open-source library providing high-level building blocks for developing deep-learning models +* Uses TensorFlow as \textit{backend engine} for low-level tensor manipulation (version 2.4) +* Part of TensorFlow core API since TensorFlow 1.4 release +* Over 375,000 individual users as of early-2020 +* Primary author: Fran\c{c}ois Chollet (Google engineer) +* [https://keras.io/](https://keras.io/) + +:::: +:::: {.column width=30%} +\begin{center} +\includegraphics[width=0.5\textwidth]{figures/keras.png} +\end{center} +:::: +::: + + + +## Example 2: Boston house prices with Keras + +\small +```python +from tensorflow.keras import models +from tensorflow.keras import layers + +model = models.Sequential() +model.add(layers.Dense(64, activation='relu', + input_shape=(train_data.shape[1],))) +model.add(layers.Dense(64, activation='relu')) +model.add(layers.Dense(1)) +model.compile(optimizer='rmsprop', loss='mse', metrics=['mae']) + +model.fit(partial_train_data, partial_train_targets, + epochs=num_epochs, batch_size=1, verbose=0) + +# Evaluate the model on the validation data +val_mse, val_mae = model.evaluate(val_data, val_targets, verbose=0) + +``` +\normalsize + +\footnotesize +[\textcolor{gray}{05\_neural\_networks\_boston\_keras.ipynb}](https://nbviewer.jupyter.org/urls/www.physi.uni-heidelberg.de/~reygers/lectures/2022/ml/examples/05_neural_networks_boston_keras.ipynb) + +## Convolutional neutral networks (CNNs) +\begin{center} +\includegraphics[width=0.7\textwidth]{figures/cnn.png} +\end{center} +::: columns +:::: {.column width=80%} +* CNNs emerged from the study of the visual cortex +* Behind many deep learning successes +* Partially connected layers + * \textcolor{gray}{Fully connected layers impractical for large images (too many neurons, overfitting)} + * Key component: Convolutional layers + * \textcolor{gray}{Set of learnable filters} + * \textcolor{gray}{Low-level features at the first layers; high-level features a the end} +:::: +:::: {.column width=20%} +\small +\textcolor{gray}{Sliding $3 \times3$ filter} +![](figures/cnn_sliding_filter.png) +:::: +::: + +## Different types of layers in a CNN +::: columns +:::: {.column width=50%} +\small \textcolor{gray}{1. Convolutional layers} \newline +\includegraphics[width=0.9\textwidth]{figures/cnn_conv_layer.png} +:::: +:::: {.column width=50%} +\small \textcolor{gray}{3. Fully connected layers} \newline +\includegraphics[width=0.9\textwidth]{figures/cnn_fully_connected.png} +:::: +::: + +\vspace{3ex} + +::: columns +:::: {.column width=60%} +\vfill +\small \textcolor{gray}{2. 
Pooling layers} \newline +\includegraphics[width=\textwidth]{figures/cnn_pooling.png} +:::: +:::: {.column width=40%} +\textcolor{gray}{\footnotesize Afshine Amidi, Shervine Amidi} \ +[\textcolor{gray}{\footnotesize Convolutional Neural Networks cheatsheet}](https://github.com/afshinea/stanford-cs-230-deep-learning/blob/master/en/cheatsheet-convolutional-neural-networks.pdf) +:::: +::: + +## MNIST classification with a CNN in Keras +\footnotesize +```python +from tensorflow.keras.models import Sequential +from tensorflow.keras.layers import Dense, Flatten, MaxPooling2D, Conv2D, Input + +# conv layer with 8 3x3 filters +model = Sequential( + [ + Input(shape=input_shape), + Conv2D(8, kernel_size=(3, 3), activation="relu"), + MaxPooling2D(pool_size=(2, 2)), + Flatten(), + Dense(16, activation="relu"), + Dense(num_classes, activation="softmax"), + ] +) + +model.summary() +``` +\normalsize + +## Defining the CNN in Keras (2) + +\footnotesize +``` +Model: "sequential_1" +_________________________________________________________________ +Layer (type) Output Shape Param # +================================================================= +conv2d_1 (Conv2D) (None, 26, 26, 8) 80 +_________________________________________________________________ +max_pooling2d_1 (MaxPooling2 (None, 13, 13, 8) 0 +_________________________________________________________________ +flatten_1 (Flatten) (None, 1352) 0 +_________________________________________________________________ +dense_2 (Dense) (None, 16) 21648 +_________________________________________________________________ +dense_3 (Dense) (None, 10) 170 +================================================================= +Total params: 21,898 +Trainable params: 21,898 +Non-trainable params: 0 +``` +\normalsize + +## Model definition +Using Keras, you have to `compile` a model, which means adding the loss function, the optimizer algorithm and validation metrics to your training setup. +\vspace{5ex} + +\footnotesize +```python +model.compile(loss="categorical_crossentropy", + optimizer="adam", + metrics=["accuracy"]) +``` +\normalsize + +## Model training + +\footnotesize +```python +from tensorflow.keras.callbacks import ModelCheckpoint, EarlyStopping + +checkpoint = ModelCheckpoint( + filepath="mnist_keras_model.h5", + save_best_only=True, + verbose=1) +early_stopping = EarlyStopping(patience=2) + +history = model.fit(x_train, y_train, # Training data + batch_size=200, # Batch size + epochs=50, # Maximum number of training epochs + validation_split=0.5, # Use 50% of the train dataset for validation + callbacks=[checkpoint, early_stopping]) # Register callbacks +``` +\normalsize + +## Exercise 4: Training a digit-classification neural network on the MNIST dataset using Keras + +\small +[\textcolor{gray}{05\_neural\_networks\_ex\_4\_mnist\_keras\_train.ipynb}](https://nbviewer.jupyter.org/urls/www.physi.uni-heidelberg.de/~reygers/lectures/2022/ml/exercises/05_neural_networks_ex_4_mnist_keras_train.ipynb) +\normalsize + +\vspace{5ex} + +a) Plot training and validation loss as well as training and validation accuracy as a function of the number of epochs + +b) Determine the accuracy of the fully trained model. + +c) Create a second notebook that reads the trained model (`mnist_keras_model.h5`). Read `your_own_digit.png` and classify it. Create your own $28 \times 28$ pixel digits with a program like gimp and check how the model performs. + + + +## Practical advice -- Which algorithm to choose? 
+\textcolor{gray}{From Kaggle competitions:} + +\vspace{3ex} +Structured data: "High level" features that have meaning: + +* feature engineering + decision trees +* Random forests +* XGBoost + +\vspace{3ex} +Unstructured data: "Low level" features, no individual meaning: + +* deep neural networks +* e.g. image classification: convolutional NN + + +## Outlook: Autoencoders + +::: columns +:::: {.column width=50%} +* Unsupervised method based on neural networks to learn a representation of the input data +* Autoencoders learn to copy the input to the output layer + * low dimensional coding of the input in the central layer +* The decoder generates data based on the coding (*generative model*) +* Applications + * Dimensionality reduction + * Denoising of data + * Machine translation +:::: +:::: {.column width=50%} +\vspace{3ex} +\begin{center} +\includegraphics[width=\textwidth]{figures/autoencoder_example.pdf} +\end{center} +:::: +::: + +## Outlook: Generative adversarial network (GANs) + +\begin{center} +\includegraphics[width=0.65\textwidth]{figures/gan.png} +\end{center} +\scriptsize +[\textcolor{gray}{https://developers.google.com/machine-learning/gan/gan\_structure}](https://developers.google.com/machine-learning/gan/gan_structure) +\normalsize + +* Discriminator's classification provides a signal that the generator uses to update its weights +* Application in particle physics: fast detector simulation +* Full GEANT simulation usually very CPU intensive + +## The future + +"Das Interessante an unserer Intelligenz ist, dass wir Go spielen können und dann vom Tisch aufstehen und Essen machen können, was eine Maschine nicht kann." + +\vspace{2ex} + +\color{gray} +\small +\hfill Bernhard Schölkopf, Max-Planck-Institut für intelligente Systeme ([Interview FAZ](https://www.faz.net/aktuell/wirtschaft/kuenstliche-intelligenz/ki-fachmann-wie-gut-europa-in-der-forschung-aufgestellt-ist-16650700.html)) +\normalsize +\color{black} + +\vfill + +"My view is throw it all away and start again" + +\color{gray} +\small +\hfill Geoffrey Hinton (DNN pioneer) on deep neural networks and backpropagation ([Interview, 2017](https://www.axios.com/artificial-intelligence-pioneer-says-we-need-to-start-over-1513305524-f619efbd-9db0-4947-a9b2-7a4c310a28fe.html)) +\normalsize +\color{black} + + diff --git a/slides/run_pandoc.sh b/slides/run_pandoc.sh new file mode 100755 index 0000000..5be9eea --- /dev/null +++ b/slides/run_pandoc.sh @@ -0,0 +1,5 @@ +#!/bin/bash + +# run pandoc for a specific file + +pandoc --pdf-engine=xelatex --variable mainfont="Helvetica" --variable sansfont="Helvetica" -t beamer -s -fmarkdown-implicit_figures --template=template.beamer 05_neural_networks.md -o neural_networks.pdf diff --git a/slides/template.beamer b/slides/template.beamer new file mode 100644 index 0000000..c60ca9c --- /dev/null +++ b/slides/template.beamer @@ -0,0 +1,259 @@ +\documentclass[aspectratio=169,$if(fontsize)$$fontsize$,$endif$$if(lang)$$babel-lang$,$endif$$if(handout)$handout,$endif$$if(beamer)$ignorenonframetext,$endif$$for(classoption)$$classoption$$sep$,$endfor$]{$documentclass$} +\setbeamertemplate{caption}[numbered] +\setbeamertemplate{caption label separator}{: } +\setbeamertemplate{itemize item}[circle] +% \setbeamertemplate{itemize item}{\raisebox{0.1em}{\scalebox{0.6}{$$\blacksquare$$}}} +\setbeamertemplate{itemize subitem}{\raisebox{0.2em}{\scalebox{.7}{$$\blacktriangleright$$}}} +\setbeamercolor{caption name}{fg=normal text.fg} 
+\beamertemplatenavigationsymbols$if(navigation)$$navigation$$else$empty$endif$ +$if(fontfamily)$ +\usepackage[$for(fontfamilyoptions)$$fontfamilyoptions$$sep$,$endfor$]{$fontfamily$} +$else$ +\usepackage{lmodern} +$endif$ +\usepackage{amssymb,amsmath} +\usepackage{ifxetex,ifluatex} +\usepackage{fixltx2e} % provides \textsubscript +\usepackage{amsbsy} +\usepackage{bm} +\renewcommand*{\vec}[1]{\bm{#1}} +\newcommand*{\mat}[1]{\bm{#1}} +\setbeamertemplate{footline}{\hspace{155mm}\insertframenumber\vspace{1mm}\hspace{10mm}} +\ifnum 0\ifxetex 1\fi\ifluatex 1\fi=0 % if pdftex + \usepackage[$if(fontenc)$$fontenc$$else$T1$endif$]{fontenc} + \usepackage[utf8]{inputenc} +$if(euro)$ + \usepackage{eurosym} +$endif$ +\else % if luatex or xelatex + \ifxetex + \usepackage{mathspec} + \else + \usepackage{fontspec} + \fi + \defaultfontfeatures{Ligatures=TeX,Scale=MatchUppercase} +$if(euro)$ + \newcommand{\euro}{€} +$endif$ +$if(mainfont)$ + \setmainfont[$for(mainfontoptions)$$mainfontoptions$$sep$,$endfor$]{$mainfont$} +$endif$ +$if(sansfont)$ + \setsansfont[$for(sansfontoptions)$$sansfontoptions$$sep$,$endfor$]{$sansfont$} +$endif$ +$if(monofont)$ + \setmonofont[Mapping=tex-ansi$if(monofontoptions)$,$for(monofontoptions)$$monofontoptions$$sep$,$endfor$$endif$]{$monofont$} +$endif$ +$if(mathfont)$ + \setmathfont(Digits,Latin,Greek)[$for(mathfontoptions)$$mathfontoptions$$sep$,$endfor$]{$mathfont$} +$endif$ +$if(CJKmainfont)$ + \usepackage{xeCJK} + \setCJKmainfont[$for(CJKoptions)$$CJKoptions$$sep$,$endfor$]{$CJKmainfont$} +$endif$ +\fi +$if(theme)$ +\usetheme{$theme$} +$endif$ +$if(colortheme)$ +\usecolortheme{$colortheme$} +$endif$ +$if(fonttheme)$ +\usefonttheme{$fonttheme$} +$endif$ +$if(mainfont)$ +\usefonttheme{serif} % use mainfont rather than sansfont for slide text +$endif$ +$if(innertheme)$ +\useinnertheme{$innertheme$} +$endif$ +$if(outertheme)$ +\useoutertheme{$outertheme$} +$endif$ +% use upquote if available, for straight quotes in verbatim environments +\IfFileExists{upquote.sty}{\usepackage{upquote}}{} +% use microtype if available +\IfFileExists{microtype.sty}{% +\usepackage{microtype} +\UseMicrotypeSet[protrusion]{basicmath} % disable protrusion for tt fonts +}{} +$if(lang)$ +\ifnum 0\ifxetex 1\fi\ifluatex 1\fi=0 % if pdftex + \usepackage[shorthands=off,$for(babel-otherlangs)$$babel-otherlangs$,$endfor$main=$babel-lang$]{babel} +$if(babel-newcommands)$ + $babel-newcommands$ +$endif$ +\else + \usepackage{polyglossia} + \setmainlanguage[$polyglossia-lang.options$]{$polyglossia-lang.name$} +$for(polyglossia-otherlangs)$ + \setotherlanguage[$polyglossia-otherlangs.options$]{$polyglossia-otherlangs.name$} +$endfor$ +\fi +$endif$ +\newif\ifbibliography +$if(natbib)$ +\usepackage{natbib} +\bibliographystyle{$if(biblio-style)$$biblio-style$$else$plainnat$endif$} +$endif$ +$if(biblatex)$ +\usepackage[$if(biblio-style)$style=$biblio-style$,$endif$$for(biblatexoptions)$$biblatexoptions$$sep$,$endfor$]{biblatex} +$for(bibliography)$ +\addbibresource{$bibliography$} +$endfor$ +$endif$ +$if(listings)$ +\usepackage{listings} +$endif$ +$if(lhs)$ +\lstnewenvironment{code}{\lstset{language=Haskell,basicstyle=\small\ttfamily}}{} +$endif$ +$if(highlighting-macros)$ +$highlighting-macros$ +$endif$ +$if(verbatim-in-note)$ +\usepackage{fancyvrb} +\VerbatimFootnotes % allows verbatim text in footnotes +$endif$ +$if(tables)$ +\usepackage{longtable,booktabs} +\usepackage{caption} +% These lines are needed to make table captions work with longtable: +\makeatletter +\def\fnum@table{\tablename~\thetable} +\makeatother 
+$endif$ +$if(graphics)$ +\usepackage{graphicx,grffile} +\makeatletter +\def\maxwidth{\ifdim\Gin@nat@width>\linewidth\linewidth\else\Gin@nat@width\fi} +\def\maxheight{\ifdim\Gin@nat@height>\textheight0.8\textheight\else\Gin@nat@height\fi} +\makeatother +% Scale images if necessary, so that they will not overflow the page +% margins by default, and it is still possible to overwrite the defaults +% using explicit options in \includegraphics[width, height, ...]{} +\setkeys{Gin}{width=\maxwidth,height=\maxheight,keepaspectratio} +$endif$ + +% Prevent slide breaks in the middle of a paragraph: +\widowpenalties 1 10000 +\raggedbottom + +$if(section-titles)$ +\AtBeginPart{ + \let\insertpartnumber\relax + \let\partname\relax + \frame{\partpage} +} +\AtBeginSection{ + \ifbibliography + \else + \let\insertsectionnumber\relax + \let\sectionname\relax + \frame{\sectionpage} + \fi +} +\AtBeginSubsection{ + \let\insertsubsectionnumber\relax + \let\subsectionname\relax + \frame{\subsectionpage} +} +$endif$ + +$if(links-as-notes)$ +% Make links footnotes instead of hotlinks: +\renewcommand{\href}[2]{#2\footnote{\url{#1}}} +$endif$ +$if(strikeout)$ +\usepackage[normalem]{ulem} +% avoid problems with \sout in headers with hyperref: +\pdfstringdefDisableCommands{\renewcommand{\sout}{}} +$endif$ +\setlength{\emergencystretch}{3em} % prevent overfull lines +\providecommand{\tightlist}{% + \setlength{\itemsep}{1ex}\setlength{\parskip}{0pt}} +$if(numbersections)$ +\setcounter{secnumdepth}{5} +$else$ +\setcounter{secnumdepth}{0} +$endif$ +$if(dir)$ +\ifxetex + % load bidi as late as possible as it modifies e.g. graphicx + $if(latex-dir-rtl)$ + \usepackage[RTLdocument]{bidi} + $else$ + \usepackage{bidi} + $endif$ +\fi +\ifnum 0\ifxetex 1\fi\ifluatex 1\fi=0 % if pdftex + \TeXXeTstate=1 + \newcommand{\RL}[1]{\beginR #1\endR} + \newcommand{\LR}[1]{\beginL #1\endL} + \newenvironment{RTL}{\beginR}{\endR} + \newenvironment{LTR}{\beginL}{\endL} +\fi +$endif$ +$for(header-includes)$ +$header-includes$ +$endfor$ + +$if(title)$ +\title{$title$} +$endif$ +$if(subtitle)$ +\subtitle{$subtitle$} +$endif$ +$if(author)$ +\author{$for(author)$$author$$sep$ \and $endfor$} +$endif$ +$if(institute)$ +\institute{$for(institute)$$institute$$sep$ \and $endfor$} +$endif$ +\date{$date$} + +\begin{document} +$if(title)$ +\frame{\titlepage} +$endif$ + +$for(include-before)$ +$include-before$ + +$endfor$ +$if(toc)$ +\begin{frame} +\tableofcontents[hideallsubsections] +\end{frame} + +$endif$ +$body$ + +$if(natbib)$ +$if(bibliography)$ +$if(biblio-title)$ +$if(book-class)$ +\renewcommand\bibname{$biblio-title$} +$else$ +\renewcommand\refname{$biblio-title$} +$endif$ +$endif$ +\begin{frame}[allowframebreaks]{$biblio-title$} +\bibliographytrue +\bibliography{$for(bibliography)$$bibliography$$sep$,$endfor$} +\end{frame} + +$endif$ +$endif$ +$if(biblatex)$ +\begin{frame}[allowframebreaks]{$biblio-title$} +\bibliographytrue +\printbibliography[heading=none] +\end{frame} + +$endif$ +$for(include-after)$ +$include-after$ + +$endfor$ +\end{document}