diff --git a/slides/.gitignore b/slides/.gitignore new file mode 100644 index 0000000..e43b0f9 --- /dev/null +++ b/slides/.gitignore @@ -0,0 +1 @@ +.DS_Store diff --git a/slides/Makefile b/slides/Makefile new file mode 100644 index 0000000..e3db178 --- /dev/null +++ b/slides/Makefile @@ -0,0 +1,10 @@ +# make creates pdf files of all newly edited .md files + +SRCS := $(wildcard *.md) +PDF := $(SRCS:%.md=%.pdf) + +OPT := --pdf-engine=xelatex --variable mainfont="Helvetica" --variable sansfont="Helvetica" -t beamer -s -fmarkdown-implicit_figures --template=template.beamer --highlight-style=kate +all: ${PDF} + +%.pdf: %.md + pandoc $(OPT) --output=$@ $< diff --git a/slides/README.md b/slides/README.md new file mode 100644 index 0000000..c22dd99 --- /dev/null +++ b/slides/README.md @@ -0,0 +1,2 @@ +Pandoc slides example following the style of [Stefan Wunsch's CERN IML workshop presentation](https://github.com/stwunsch/iml_keras_workshop) on [keras](https://keras.io/) (see slides folder) + diff --git a/slides/copy_slides.sh b/slides/copy_slides.sh new file mode 100755 index 0000000..89a426e --- /dev/null +++ b/slides/copy_slides.sh @@ -0,0 +1,6 @@ +# slides (do chgrp machlearn later) +# scp CIPpoolAccess.PDF reygers@rho0:public_html/lectures/2021/ml/transparencies/ +# scp 03_ml_basics.pdf reygers@rho0:public_html/lectures/2021/ml/transparencies/ +# scp 04_decision_trees.pdf reygers@rho0:public_html/lectures/2021/ml/transparencies/ +scp 05_neural_networks.pdf reygers@rho0:public_html/lectures/2021/ml/transparencies/ + diff --git a/slides/decision_trees.md b/slides/decision_trees.md new file mode 100644 index 0000000..06817cd --- /dev/null +++ b/slides/decision_trees.md @@ -0,0 +1,347 @@ +--- +title: | + | Introduction to Data Analysis and Machine Learning in Physics: + | 4. Decision Trees + +author: "Martino Borsato, Jörg Marks, Klaus Reygers" +date: "Studierendentage, 11-14 April 2022" +--- +## Exercises + +* Exercise 1: Compare different decision tree classifiers + * [`04_decision_trees_ex_1_compare_tree_classifiers.ipynb`](https://nbviewer.jupyter.org/urls/www.physi.uni-heidelberg.de/~reygers/lectures/2022/ml/exercises/04_decision_trees_ex_1_compare_tree_classifiers.ipynb) +* Exercise 2: Apply XGBoost classifier to MAGIC data set + * [`04_decision_trees_ex_2_magic_xgboost_and_random_forest.ipynb`](https://nbviewer.jupyter.org/urls/www.physi.uni-heidelberg.de/~reygers/lectures/2022/ml/exercises/04_decision_trees_ex_2_magic_xgboost_and_random_forest.ipynb) +* Exercise 3: Feature importance +* Exercise 4: Interpret a classifier with SHAP values + +## Decision trees + +\begin{figure} +\centering +\includegraphics[width=0.85\textwidth]{figures/mini_boone_decisions_tree.png} +\end{figure} + +\begin{center} +Leaf nodes classify events as either signal or background +\end{center} + +## Decision trees: Rectangular volumes in feature space + +\begin{figure} +\centering +\includegraphics[width=0.75\textwidth]{figures/decision_trees_feature_space.png} +\end{figure} + +* Easy to interpret and visualize: Space of feature vectors split up into rectangular volumes (attributed to either signal or background) +* How to build a decision tree in an optimal way? + +## Finding optimal cuts + +Separation between
signal and background is often measured with the Gini index (or Gini impurity): + +$$ G = p (1-p) $$ + +Here $p$ is the purity: +$$ p = \frac{\sum_\mathrm{signal} w_i}{\sum_\mathrm{signal} w_i + \sum_\mathrm{background} w_i}, \quad w_i = \text{weight of event}\; i$$ + +\vfill +\textcolor{gray}{Usefulness of weights will become apparent soon.} + +\vfill +Improvement in signal/background separation after splitting a set A into two sets B and C: +$$ \Delta = W_A G_A - W_B G_B - W_C G_C \quad \text{where} \quad W_X = \sum_{X} w_i $$ + +## Gini impurity and other purity measures +\begin{figure} +\centering +\includegraphics[width=0.7\textwidth]{figures/signal_purity.png} +\end{figure} + + +## Decision tree pruning + +::: columns +:::: {.column width=50%} + +When to stop growing a tree? + +* When all nodes are essentially pure? +* Well, that's overfitting! + +\vspace{3ex} + +Pruning + +* Cut back fully grown tree to avoid overtraining, i.e., replace nodes and subtrees by leaves + +:::: +:::: {.column width=50%} +\begin{figure} +\centering +\includegraphics[width=0.85\textwidth]{figures/tree_pruning_slides.png} +\end{figure} +:::: +::: + +## Single decision trees: Pros and cons + +\textcolor{green}{Pros:} + +* Requires little data preparation (unlike neural networks) +* Can use continuous and categorical inputs + +\vfill + +\textcolor{red}{Cons:} + +* Danger of overfitting training data +* Sensitive to fluctuations in the training data +* Hard to find global optimum +* When to stop splitting? + +## Ensemble methods: Combine weak learners + +::: columns +:::: {.column width=70%} +* Bootstrap Aggregating (Bagging) + * Sample training data (with replacement) and train a separate model on each of the derived training sets + * Classify example with majority vote, or compute average output from each tree as model output + +:::: +:::: {.column width=30%} +$$ y(\vec x) = \frac{1}{N_\mathrm{trees}} \sum_{i=1}^{N_{trees}} y_i(\vec x) $$ +:::: +::: +\vfill +::: columns +:::: {.column width=70%} +* Boosting + * Train $N$ models in sequence, giving more weight to examples not correctly classified by previous model + * Take weighted average to classify examples + +:::: +:::: {.column width=30%} +$$ y(\vec x) = \frac{\sum_{i=1}^{N_\mathrm{trees}} \alpha_i y_i(\vec x)}{\sum_{i=1}^{N_\mathrm{trees}} \alpha_i} $$ +:::: +::: + +## Random forests + +* "One of the most widely used and versatile algorithms in data science and machine learning" +\tiny \textcolor{gray}{arXiv:1803.08823v3} \normalsize +\vfill +* Use bagging to select random example subset +\vfill +* Train a tree, but only use random subset of features at each split + * this reduces the correlation between different trees + * makes the decision more robust to missing data + +## Boosted decision trees: Idea + +\begin{figure} +\centering +\includegraphics[width=0.75\textwidth]{figures/bdt.png} +\end{figure} + +## AdaBoost (short for Adaptive Boosting) + +Initial training sample + +\begin{center} +\begin{tabular}{l l} +$\vec x_1, ..., \vec x_n$: & multivariate event data \\ +$y_1, ..., y_n$: & true class labels, $+1$ or $-1$ \\ +$w_1^{(1)}, ..., w_n^{(1)}$ & event weights +\end{tabular} +\end{center} + +with equal weights normalized as + +$$ \sum_{i=1}^n w_i^{(1)} = 1 $$ + +Train first classifier $f_1$: + +\begin{center} +\begin{tabular}{l l} +$f_1(\vec x_i) > 0$ & classify as signal \\ +$f_1(\vec x_i) < 0$ & classify as background +\end{tabular} +\end{center} + +## AdaBoost: Updating events weights + +Define training sample $k+1$ from training sample $k$ by 
updating weights: + +$$ w_i^{(k+1)} = w_i^{(k)} \frac{e^{- \alpha_k f_k(\vec x_i) y_i/2}}{Z_k} $$ + +\footnotesize +\textcolor{gray}{$$ i = \text{event index}, \quad Z_k:\; \text{normalization factor so that } \sum_{i=1}^n w_i^{(k)} = 1$$} +\normalsize + +Weight is increased if event was misclassified by the previous classifier + +$\to$ "Next classifier should pay more attention to misclassified events" + + +\vfill +At each step the classifier $f_k$ minimizes error rate: + +$$ \varepsilon_k = \sum_{i=1}^n w_i^{(k)} I(y_i f_k( \vec x_i) \le 0), +\quad I(X) = 1 \; \text{if} \; X \; \text{is true, 0 otherwise} $$ + +## AdaBoost: Assigning the classifier score + +Assign score to each classifier according to its error rate: +$$ \alpha_k = \ln \frac{1 - \varepsilon_k}{\varepsilon_k} $$ + +\vfill + +Combined classifier (weighted average): +$$ f(\vec x) = \sum_{k=1}^K \alpha_k f_k(\vec x) $$ + + + +## Gradient boosting + +Basic idea: + +* Train a first decision tree +* Then train a second one on the residual errors made by the first tree +* And so on + +\vfill + +In slightly more detail: + +* \color{gray} Consider labeled training data: $\{\vec x_i, y_i\}$ +* Model prediction at iteration $m$: $F_m(\vec x_i)$ +* New model: $F_{m+1}(\vec x) = F_m(\vec x) + h_m(\vec x)$ +* Find $h_m(\vec x)$ by fitting it to +$\{(\vec x_1, y_1 - F_m(\vec x_1)), \; (\vec x_2, y_2 - F_m(\vec x_2)), \; ... \; (\vec x_n, y_n - F_m(\vec x_n)) \}$ + +\color{black} + +## Example 1: Predict critical temperature for superconductivty (Regression with XGBoost) (1) +\small +[\textcolor{gray}{04\_decision\_trees\_critical\_temp\_regression.ipynb}](https://nbviewer.jupyter.org/urls/www.physi.uni-heidelberg.de/~reygers/lectures/2022/ml/examples/04_decision_trees_critical_temp_regression.ipynb) +\normalsize + +\vfill + +Superconductivty data set: + +Predict the critical temperature based on 81 material features. +\footnotesize +[\textcolor{gray}{https://archive.ics.uci.edu/ml/datasets/Superconductivty+Data}](https://archive.ics.uci.edu/ml/datasets/Superconductivty+Data) +\normalsize + +\vfill + +From the abstract: + + +We estimate a statistical model to predict the superconducting critical temperature based on the features extracted from the superconductor’s chemical formula. The statistical model gives reasonable out-of-sample predictions: ±9.5 K based on root-mean-squared-error. Features extracted based on thermal conductivity, atomic radius, valence, electron affinity, and atomic mass contribute the most to the model’s predictive accuracy. 
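
The regression code on the next slide assumes that the material features and the critical temperature have already been split into training and test sets. A minimal preparation sketch (the file name `train.csv` and the target column `critical_temp` are assumptions, not taken from the notebook):

\footnotesize
```python
import pandas as pd
from sklearn.model_selection import train_test_split

# assumption: the UCI data was downloaded as "train.csv" with target column "critical_temp"
df = pd.read_csv("train.csv")
X = df.drop(columns=["critical_temp"])   # 81 material features
y = df["critical_temp"]                  # critical temperature in K

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
```
\normalsize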
+ +\vfill + +\tiny +[\textcolor{gray}{https://doi.org/10.1016/j.commatsci.2018.07.052}](https://doi.org/10.1016/j.commatsci.2018.07.052) +\normalsize + + +## Example 1: Predict critical temperature for superconductivty (Regression with XGBoost) (2) + +::: columns +:::: {.column width=60%} +\footnotesize +```python +import xgboost as xgb + +XGBreg = xgb.sklearn.XGBRegressor() + +XGBreg.fit(X_train, y_train) + +y_pred = XGBreg.predict(X_test) + +from sklearn.metrics import mean_squared_error +rms = np.sqrt(mean_squared_error(y_test, y_pred)) +print(f"root mean square error {rms:.2f}") +``` + +\textcolor{gray}{This gives:} + +`root mean square error 9.68` +:::: +:::: {.column width=40%} +\vspace{6ex} +![](figures/critical_temperature.pdf) +:::: +::: + +## Exercise 1: Compare different decision tree classifiers + +\small +[\textcolor{gray}{04\_decision\_trees\_ex\_1\_compare\_tree\_classifiers.ipynb}](https://nbviewer.jupyter.org/urls/www.physi.uni-heidelberg.de/~reygers/lectures/2022/ml/exercises/04_decision_trees_ex_1_compare_tree_classifiers.ipynb) + +\vspace{5ex} + +Compare scikit-learns's `AdaBoostClassifier`, `RandomForestClassifier`, and `GradientBoostingClassifier` by plotting their ROC curves for the heart disease data set. \newline + +\vspace{2ex} + +Is there a classifier that clearly performs best? + + +## Exercise 2: Apply XGBoost classifier to MAGIC data set + +\small +[\textcolor{gray}{04\_decision\_trees\_ex\_2\_magic\_xgboost\_and\_random\_forest.ipynb}](https://nbviewer.jupyter.org/urls/www.physi.uni-heidelberg.de/~reygers/lectures/2022/ml/exercises/04_decision_trees_ex_2_magic_xgboost_and_random_forest.ipynb) +\normalsize + +\footnotesize +```python +# train XGBoost boosted decision tree +import xgboost as xgb +XGBclassifier = xgb.sklearn.XGBClassifier(nthread=-1, seed=1, n_estimators=1000) +``` +\normalsize + +\small +a) Plot predicted probabilities for the test sample for signal and background events (\texttt{plt.hist}) +b) Which is the most important feature for discriminating signal and background according to XGBoost? \ +Hint: use plot_impartance from XGBoost (see [XGBoost plotting API](https://xgboost.readthedocs.io/en/latest/python/python_api.html)). Do you get the same answer for all three performance measures provided by XGBoost (“weight”, “gain”, or “cover”)? +c) Visualize one decision tree from the ensemble (let's say tree number 10). For this you need the the graphviz package (`pip3 install graphviz`) +d) Compare the performance of XGBoost with the [**random forest classifier**](https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.RandomForestClassifier.html) from [**scikit learn**](https://scikit-learn.org/stable/index.html). Plot signal and background efficiency for both classifiers in one plot. Which classifier performs better? +\normalsize + + +## Exercise 3: Feature importance + +\small +[\textcolor{gray}{04\_decision\_trees\_ex\_3\_magic\_feature\_importance.ipynb}](https://nbviewer.jupyter.org/urls/www.physi.uni-heidelberg.de/~reygers/lectures/2022/ml/exercises/04_decision_trees_ex_3_magic_feature_importance.ipynb) +\normalsize + +\vspace{3ex} + +Evaluate the importance of each of the $n$ features in the training of the XGBoost classifier for the MAGIC data set by dropping one of the features. This gives $n$ different classifiers. Compare the performance of these classifiers using the AUC score. 
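
\vspace{2ex}

A possible starting point (a sketch only; it assumes the MAGIC features and labels are available as a pandas DataFrame `X` and a Series `y`, as in exercise 2):

\footnotesize
```python
import xgboost as xgb
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.5, random_state=1)

# drop one feature at a time, retrain, and compare the AUC scores
for feature in X_train.columns:
    clf = xgb.sklearn.XGBClassifier(nthread=-1, seed=1)
    clf.fit(X_train.drop(columns=[feature]), y_train)
    y_score = clf.predict_proba(X_test.drop(columns=[feature]))[:, 1]
    print(f"without {feature}: AUC = {roc_auc_score(y_test, y_score):.4f}")
```
\normalsize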
+ + +## Exercise 4: Interpret a classifier with SHAP values + +SHAP (SHapley Additive exPlanations) are a means to explain the output of any machine learning model. [Shapeley values](https://en.wikipedia.org/wiki/Shapley_value) are a concept that is used in cooperative game theory. They are named after Lloyd Shapley who won the Nobel Prize in Economics in 2012. + +\vfill + +Use the Python library [`SHAP`](https://shap.readthedocs.io/en/latest/index.html) to quantify the feature importance. + +a) Study the documentation at [https://shap.readthedocs.io/en/latest/tabular_examples.html](https://shap.readthedocs.io/en/latest/tabular_examples.html) + +b) Create a summary plot of the feature importance in the MAGIC data set with `shap.summary_plot` for the XGBoost classifier of exercise 2. What are the three most important features? + +c) Do the same for the superconductivity data set? What are the three most important features? + + + + + diff --git a/slides/figures/03_ml_basics_galton_linear_regression_iminuit.pdf b/slides/figures/03_ml_basics_galton_linear_regression_iminuit.pdf new file mode 100644 index 0000000..b555633 Binary files /dev/null and b/slides/figures/03_ml_basics_galton_linear_regression_iminuit.pdf differ diff --git a/slides/figures/03_ml_basics_log_regr_heart_disease.pdf b/slides/figures/03_ml_basics_log_regr_heart_disease.pdf new file mode 100644 index 0000000..a16e1d7 Binary files /dev/null and b/slides/figures/03_ml_basics_log_regr_heart_disease.pdf differ diff --git a/slides/figures/03_ml_basics_logistic_regression.pdf b/slides/figures/03_ml_basics_logistic_regression.pdf new file mode 100644 index 0000000..277c8fa Binary files /dev/null and b/slides/figures/03_ml_basics_logistic_regression.pdf differ diff --git a/slides/figures/L1vsL2.pdf b/slides/figures/L1vsL2.pdf new file mode 100644 index 0000000..dfdd9ae Binary files /dev/null and b/slides/figures/L1vsL2.pdf differ diff --git a/slides/figures/activation_functions.png b/slides/figures/activation_functions.png new file mode 100644 index 0000000..5dec32b Binary files /dev/null and b/slides/figures/activation_functions.png differ diff --git a/slides/figures/adversarial_attack.png b/slides/figures/adversarial_attack.png new file mode 100644 index 0000000..49bd32f Binary files /dev/null and b/slides/figures/adversarial_attack.png differ diff --git a/slides/figures/ai_history.png b/slides/figures/ai_history.png new file mode 100644 index 0000000..2febc77 Binary files /dev/null and b/slides/figures/ai_history.png differ diff --git a/slides/figures/ai_ml_dl.pdf b/slides/figures/ai_ml_dl.pdf new file mode 100644 index 0000000..03fd6d0 Binary files /dev/null and b/slides/figures/ai_ml_dl.pdf differ diff --git a/slides/figures/ann.png b/slides/figures/ann.png new file mode 100644 index 0000000..af3d34d Binary files /dev/null and b/slides/figures/ann.png differ diff --git a/slides/figures/anomaly_detection.png b/slides/figures/anomaly_detection.png new file mode 100644 index 0000000..d66fbb8 Binary files /dev/null and b/slides/figures/anomaly_detection.png differ diff --git a/slides/figures/autoencoder_example.pdf b/slides/figures/autoencoder_example.pdf new file mode 100644 index 0000000..f4b2407 Binary files /dev/null and b/slides/figures/autoencoder_example.pdf differ diff --git a/slides/figures/bdt.png b/slides/figures/bdt.png new file mode 100644 index 0000000..37e22e8 Binary files /dev/null and b/slides/figures/bdt.png differ diff --git a/slides/figures/book-murphy.png b/slides/figures/book-murphy.png new file mode 100644 
index 0000000..63f8f61 Binary files /dev/null and b/slides/figures/book-murphy.png differ diff --git a/slides/figures/book_deep_learning_for_physics_research.png b/slides/figures/book_deep_learning_for_physics_research.png new file mode 100644 index 0000000..c1e706e Binary files /dev/null and b/slides/figures/book_deep_learning_for_physics_research.png differ diff --git a/slides/figures/boston_house_prices.pdf b/slides/figures/boston_house_prices.pdf new file mode 100644 index 0000000..68ad798 Binary files /dev/null and b/slides/figures/boston_house_prices.pdf differ diff --git a/slides/figures/cnn.png b/slides/figures/cnn.png new file mode 100644 index 0000000..e717e2c Binary files /dev/null and b/slides/figures/cnn.png differ diff --git a/slides/figures/cnn_conv_layer.png b/slides/figures/cnn_conv_layer.png new file mode 100644 index 0000000..b50382f Binary files /dev/null and b/slides/figures/cnn_conv_layer.png differ diff --git a/slides/figures/cnn_fully_connected.png b/slides/figures/cnn_fully_connected.png new file mode 100644 index 0000000..ec306f2 Binary files /dev/null and b/slides/figures/cnn_fully_connected.png differ diff --git a/slides/figures/cnn_pooling.png b/slides/figures/cnn_pooling.png new file mode 100644 index 0000000..7aa1ae4 Binary files /dev/null and b/slides/figures/cnn_pooling.png differ diff --git a/slides/figures/cnn_sliding_filter.png b/slides/figures/cnn_sliding_filter.png new file mode 100644 index 0000000..72855b9 Binary files /dev/null and b/slides/figures/cnn_sliding_filter.png differ diff --git a/slides/figures/critical_temperature.pdf b/slides/figures/critical_temperature.pdf new file mode 100644 index 0000000..b4c4a3b Binary files /dev/null and b/slides/figures/critical_temperature.pdf differ diff --git a/slides/figures/cross_val.png b/slides/figures/cross_val.png new file mode 100644 index 0000000..d1c81b9 Binary files /dev/null and b/slides/figures/cross_val.png differ diff --git a/slides/figures/decision_boundaries.png b/slides/figures/decision_boundaries.png new file mode 100644 index 0000000..25f2501 Binary files /dev/null and b/slides/figures/decision_boundaries.png differ diff --git a/slides/figures/decision_trees_feature_space.png b/slides/figures/decision_trees_feature_space.png new file mode 100644 index 0000000..4002331 Binary files /dev/null and b/slides/figures/decision_trees_feature_space.png differ diff --git a/slides/figures/deep_learning_book.png b/slides/figures/deep_learning_book.png new file mode 100644 index 0000000..cc9dcd8 Binary files /dev/null and b/slides/figures/deep_learning_book.png differ diff --git a/slides/figures/deep_learning_with_python.png b/slides/figures/deep_learning_with_python.png new file mode 100644 index 0000000..dc8aa2c Binary files /dev/null and b/slides/figures/deep_learning_with_python.png differ diff --git a/slides/figures/deepl.png b/slides/figures/deepl.png new file mode 100644 index 0000000..bf9e88f Binary files /dev/null and b/slides/figures/deepl.png differ diff --git a/slides/figures/dnn.png b/slides/figures/dnn.png new file mode 100644 index 0000000..67abadc Binary files /dev/null and b/slides/figures/dnn.png differ diff --git a/slides/figures/dropout.png b/slides/figures/dropout.png new file mode 100644 index 0000000..fca7610 Binary files /dev/null and b/slides/figures/dropout.png differ diff --git a/slides/figures/example_overtraining.png b/slides/figures/example_overtraining.png new file mode 100644 index 0000000..baf0a91 Binary files /dev/null and b/slides/figures/example_overtraining.png 
differ diff --git a/slides/figures/feature_transformation.png b/slides/figures/feature_transformation.png new file mode 100644 index 0000000..edbb7a5 Binary files /dev/null and b/slides/figures/feature_transformation.png differ diff --git a/slides/figures/fisher.png b/slides/figures/fisher.png new file mode 100644 index 0000000..e5a41e5 Binary files /dev/null and b/slides/figures/fisher.png differ diff --git a/slides/figures/fisher_linear_decision_boundary.png b/slides/figures/fisher_linear_decision_boundary.png new file mode 100644 index 0000000..0a527de Binary files /dev/null and b/slides/figures/fisher_linear_decision_boundary.png differ diff --git a/slides/figures/gan.png b/slides/figures/gan.png new file mode 100644 index 0000000..643c410 Binary files /dev/null and b/slides/figures/gan.png differ diff --git a/slides/figures/gradient_descent.png b/slides/figures/gradient_descent.png new file mode 100644 index 0000000..93eaad4 Binary files /dev/null and b/slides/figures/gradient_descent.png differ diff --git a/slides/figures/gradient_descent_cmp.png b/slides/figures/gradient_descent_cmp.png new file mode 100644 index 0000000..4ca0271 Binary files /dev/null and b/slides/figures/gradient_descent_cmp.png differ diff --git a/slides/figures/hands_on_machine_learning.png b/slides/figures/hands_on_machine_learning.png new file mode 100644 index 0000000..db9af4d Binary files /dev/null and b/slides/figures/hands_on_machine_learning.png differ diff --git a/slides/figures/handwritten_digits.png b/slides/figures/handwritten_digits.png new file mode 100644 index 0000000..bbcc538 Binary files /dev/null and b/slides/figures/handwritten_digits.png differ diff --git a/slides/figures/heart_table.png b/slides/figures/heart_table.png new file mode 100644 index 0000000..bdc59ce Binary files /dev/null and b/slides/figures/heart_table.png differ diff --git a/slides/figures/imagenet.png b/slides/figures/imagenet.png new file mode 100644 index 0000000..697061e Binary files /dev/null and b/slides/figures/imagenet.png differ diff --git a/slides/figures/imagenet_challenge.png b/slides/figures/imagenet_challenge.png new file mode 100644 index 0000000..8b90c92 Binary files /dev/null and b/slides/figures/imagenet_challenge.png differ diff --git a/slides/figures/iminuit_minos_scan-1.png b/slides/figures/iminuit_minos_scan-1.png new file mode 100644 index 0000000..58f5a85 Binary files /dev/null and b/slides/figures/iminuit_minos_scan-1.png differ diff --git a/slides/figures/iminuit_minos_scan-2.png b/slides/figures/iminuit_minos_scan-2.png new file mode 100644 index 0000000..0584938 Binary files /dev/null and b/slides/figures/iminuit_minos_scan-2.png differ diff --git a/slides/figures/iris_dataset.png b/slides/figures/iris_dataset.png new file mode 100644 index 0000000..cf79956 Binary files /dev/null and b/slides/figures/iris_dataset.png differ diff --git a/slides/figures/keras.png b/slides/figures/keras.png new file mode 100644 index 0000000..723ca74 Binary files /dev/null and b/slides/figures/keras.png differ diff --git a/slides/figures/knn.png b/slides/figures/knn.png new file mode 100644 index 0000000..fcbad16 Binary files /dev/null and b/slides/figures/knn.png differ diff --git a/slides/figures/logistic_fct.png b/slides/figures/logistic_fct.png new file mode 100644 index 0000000..6c0bb80 Binary files /dev/null and b/slides/figures/logistic_fct.png differ diff --git a/slides/figures/loss_fct.png b/slides/figures/loss_fct.png new file mode 100644 index 0000000..5f6b621 Binary files /dev/null and 
b/slides/figures/loss_fct.png differ diff --git a/slides/figures/magic_photo.png b/slides/figures/magic_photo.png new file mode 100644 index 0000000..8ef520d Binary files /dev/null and b/slides/figures/magic_photo.png differ diff --git a/slides/figures/magic_photo_small.png b/slides/figures/magic_photo_small.png new file mode 100644 index 0000000..fd84ce3 Binary files /dev/null and b/slides/figures/magic_photo_small.png differ diff --git a/slides/figures/magic_shower_em_had.png b/slides/figures/magic_shower_em_had.png new file mode 100644 index 0000000..92edd27 Binary files /dev/null and b/slides/figures/magic_shower_em_had.png differ diff --git a/slides/figures/magic_shower_em_had_small.png b/slides/figures/magic_shower_em_had_small.png new file mode 100644 index 0000000..ee51248 Binary files /dev/null and b/slides/figures/magic_shower_em_had_small.png differ diff --git a/slides/figures/magic_shower_parameters.png b/slides/figures/magic_shower_parameters.png new file mode 100644 index 0000000..4a9871e Binary files /dev/null and b/slides/figures/magic_shower_parameters.png differ diff --git a/slides/figures/magic_sketch.png b/slides/figures/magic_sketch.png new file mode 100644 index 0000000..9f66e62 Binary files /dev/null and b/slides/figures/magic_sketch.png differ diff --git a/slides/figures/matplotlib_Figure_1.png b/slides/figures/matplotlib_Figure_1.png new file mode 100644 index 0000000..b4129f1 Binary files /dev/null and b/slides/figures/matplotlib_Figure_1.png differ diff --git a/slides/figures/matplotlib_Figure_2.png b/slides/figures/matplotlib_Figure_2.png new file mode 100644 index 0000000..e013010 Binary files /dev/null and b/slides/figures/matplotlib_Figure_2.png differ diff --git a/slides/figures/matplotlib_Figure_3.png b/slides/figures/matplotlib_Figure_3.png new file mode 100644 index 0000000..b819274 Binary files /dev/null and b/slides/figures/matplotlib_Figure_3.png differ diff --git a/slides/figures/matplotlib_Figure_4.png b/slides/figures/matplotlib_Figure_4.png new file mode 100644 index 0000000..7e70aff Binary files /dev/null and b/slides/figures/matplotlib_Figure_4.png differ diff --git a/slides/figures/mini_boone_decisions_tree.png b/slides/figures/mini_boone_decisions_tree.png new file mode 100644 index 0000000..55f0961 Binary files /dev/null and b/slides/figures/mini_boone_decisions_tree.png differ diff --git a/slides/figures/ml_example_spam.png b/slides/figures/ml_example_spam.png new file mode 100644 index 0000000..f3ad26e Binary files /dev/null and b/slides/figures/ml_example_spam.png differ diff --git a/slides/figures/mlp.png b/slides/figures/mlp.png new file mode 100644 index 0000000..fc791a2 Binary files /dev/null and b/slides/figures/mlp.png differ diff --git a/slides/figures/mnist.png b/slides/figures/mnist.png new file mode 100644 index 0000000..14a7161 Binary files /dev/null and b/slides/figures/mnist.png differ diff --git a/slides/figures/monitoring_overtraining.png b/slides/figures/monitoring_overtraining.png new file mode 100644 index 0000000..bdc9a0b Binary files /dev/null and b/slides/figures/monitoring_overtraining.png differ diff --git a/slides/figures/mva.png b/slides/figures/mva.png new file mode 100644 index 0000000..578d268 Binary files /dev/null and b/slides/figures/mva.png differ diff --git a/slides/figures/mva_nn.png b/slides/figures/mva_nn.png new file mode 100644 index 0000000..8c7077d Binary files /dev/null and b/slides/figures/mva_nn.png differ diff --git a/slides/figures/neuron.png b/slides/figures/neuron.png new file mode 100644 index 
0000000..d8dea7b Binary files /dev/null and b/slides/figures/neuron.png differ diff --git a/slides/figures/nn_decision_boundary.png b/slides/figures/nn_decision_boundary.png new file mode 100644 index 0000000..4e0745d Binary files /dev/null and b/slides/figures/nn_decision_boundary.png differ diff --git a/slides/figures/pandas_crosstabplot.png b/slides/figures/pandas_crosstabplot.png new file mode 100644 index 0000000..fea9408 Binary files /dev/null and b/slides/figures/pandas_crosstabplot.png differ diff --git a/slides/figures/pandas_histogramm.png b/slides/figures/pandas_histogramm.png new file mode 100644 index 0000000..ecec461 Binary files /dev/null and b/slides/figures/pandas_histogramm.png differ diff --git a/slides/figures/pandas_scatterplot.png b/slides/figures/pandas_scatterplot.png new file mode 100644 index 0000000..d546f8a Binary files /dev/null and b/slides/figures/pandas_scatterplot.png differ diff --git a/slides/figures/pdf_from_2d_histogram.png b/slides/figures/pdf_from_2d_histogram.png new file mode 100644 index 0000000..736e93b Binary files /dev/null and b/slides/figures/pdf_from_2d_histogram.png differ diff --git a/slides/figures/perceptron_photo.png b/slides/figures/perceptron_photo.png new file mode 100644 index 0000000..badacb5 Binary files /dev/null and b/slides/figures/perceptron_photo.png differ diff --git a/slides/figures/perceptron_retina.png b/slides/figures/perceptron_retina.png new file mode 100644 index 0000000..4b512ce Binary files /dev/null and b/slides/figures/perceptron_retina.png differ diff --git a/slides/figures/perceptron_weighted_sum.png b/slides/figures/perceptron_weighted_sum.png new file mode 100644 index 0000000..2f6a49f Binary files /dev/null and b/slides/figures/perceptron_weighted_sum.png differ diff --git a/slides/figures/perceptron_with_threshold.png b/slides/figures/perceptron_with_threshold.png new file mode 100644 index 0000000..0ee3a97 Binary files /dev/null and b/slides/figures/perceptron_with_threshold.png differ diff --git a/slides/figures/regularization.png b/slides/figures/regularization.png new file mode 100644 index 0000000..90e0c3b Binary files /dev/null and b/slides/figures/regularization.png differ diff --git a/slides/figures/relu.png b/slides/figures/relu.png new file mode 100644 index 0000000..317d730 Binary files /dev/null and b/slides/figures/relu.png differ diff --git a/slides/figures/rootOptions.png b/slides/figures/rootOptions.png new file mode 100644 index 0000000..21f984a Binary files /dev/null and b/slides/figures/rootOptions.png differ diff --git a/slides/figures/scikit-learn.png b/slides/figures/scikit-learn.png new file mode 100644 index 0000000..ca0a9a5 Binary files /dev/null and b/slides/figures/scikit-learn.png differ diff --git a/slides/figures/sigmoid.png b/slides/figures/sigmoid.png new file mode 100644 index 0000000..32dcfdb Binary files /dev/null and b/slides/figures/sigmoid.png differ diff --git a/slides/figures/signal_background_distr.png b/slides/figures/signal_background_distr.png new file mode 100644 index 0000000..f73007a Binary files /dev/null and b/slides/figures/signal_background_distr.png differ diff --git a/slides/figures/signal_purity.png b/slides/figures/signal_purity.png new file mode 100644 index 0000000..0cffdc9 Binary files /dev/null and b/slides/figures/signal_purity.png differ diff --git a/slides/figures/stochastic_gradient_descent.png b/slides/figures/stochastic_gradient_descent.png new file mode 100644 index 0000000..48a07a9 Binary files /dev/null and 
b/slides/figures/stochastic_gradient_descent.png differ diff --git a/slides/figures/supervised_learning_car_plane.png b/slides/figures/supervised_learning_car_plane.png new file mode 100644 index 0000000..e9a1842 Binary files /dev/null and b/slides/figures/supervised_learning_car_plane.png differ diff --git a/slides/figures/supervised_nutshell.png b/slides/figures/supervised_nutshell.png new file mode 100644 index 0000000..7ece776 Binary files /dev/null and b/slides/figures/supervised_nutshell.png differ diff --git a/slides/figures/tensorflow.png b/slides/figures/tensorflow.png new file mode 100644 index 0000000..85fdf6f Binary files /dev/null and b/slides/figures/tensorflow.png differ diff --git a/slides/figures/tf_playground.png b/slides/figures/tf_playground.png new file mode 100644 index 0000000..725efd7 Binary files /dev/null and b/slides/figures/tf_playground.png differ diff --git a/slides/figures/tree_pruning_slides.png b/slides/figures/tree_pruning_slides.png new file mode 100644 index 0000000..671a64e Binary files /dev/null and b/slides/figures/tree_pruning_slides.png differ diff --git a/slides/figures/underfitting_overfitting.pdf b/slides/figures/underfitting_overfitting.pdf new file mode 100644 index 0000000..89e4197 Binary files /dev/null and b/slides/figures/underfitting_overfitting.pdf differ diff --git a/slides/figures/underfitting_overfitting_001.png b/slides/figures/underfitting_overfitting_001.png new file mode 100644 index 0000000..e3a5221 Binary files /dev/null and b/slides/figures/underfitting_overfitting_001.png differ diff --git a/slides/figures/videogame.png b/slides/figures/videogame.png new file mode 100644 index 0000000..ee2a654 Binary files /dev/null and b/slides/figures/videogame.png differ diff --git a/slides/figures/xor.png b/slides/figures/xor.png new file mode 100644 index 0000000..9d872c2 Binary files /dev/null and b/slides/figures/xor.png differ diff --git a/slides/figures/xor_like_data.pdf b/slides/figures/xor_like_data.pdf new file mode 100644 index 0000000..c312cd7 Binary files /dev/null and b/slides/figures/xor_like_data.pdf differ diff --git a/slides/fit_intro.md b/slides/fit_intro.md new file mode 100644 index 0000000..6904f88 --- /dev/null +++ b/slides/fit_intro.md @@ -0,0 +1,563 @@ +--- +title: | + | Introduction to Data Analysis and Machine Learning in Physics: + | 2. Data modeling and fitting + +author: "Martino Borsato, Jörg Marks, Klaus Reygers" +date: "Studierendentage, 11-14 April 2022" +--- + +## Data modeling and fitting - introduction + +Data analysis is a process of understanding and modeling measured +data. The goal is to find patterns and to obtain inferences allowing to +observe underlying patterns. + + * There are 2 approaches to statistical data modeling + * Hypothesis testing: is our data compatible with a certain model? + * Determination of model parameter: use the data to determine the parameters + of a (theoretical) model + + * For the determination of model parameter + * Analysis of data distributions $\rightarrow$ mean, variance, + median, FWHM, .... \newline + allows for an approximate determination of model parameter + + * Data fitting with the least square method $\rightarrow$ an iterative + process which minimizes the deviation of a model decribed by parameters + from data. This determines the optimal values and uncertainties + of the parameters. + + * Maximum likelihood fitting $\rightarrow$ find a set of model parameters + which most likely describe the data by maximizing the probability + distributions. 
+ +The parameter determination by minimization is an integral part of machine +learning approaches, here a system learns patterns and predicts +related ones. This is the focus in the upcoming days. + +## Data modeling and fitting - introduction + +Data analysis is a process of understanding and modeling measured +data. The goal is to find patterns and to obtain inferences allowing to +observe underlying patterns. + + * There are 2 approaches to statistical data modeling + * Hypothesis testing: is our data compatible with a certain model? + * Determination of model parameter: use the data to determine the parameters + of a (theoretical) model + + * For the determination of model parameter + * Analysis of data distributions $\rightarrow$ mean, variance, + median, FWHM, .... \newline + allows for an approximate determination of model parameter + + \setbeamertemplate{itemize subitem}{\color{red}\tiny$\blacksquare$} + * \textcolor{blue}{Data fitting with the least square method + $\rightarrow$ an iterative + process which minimizes the deviation of a model decribed by parameters + from data. This determines the optimal values and uncertainties + of the parameters.} + + \setbeamertemplate{itemize subitem}{\color{blue}\tiny$\blacktriangleright$} + * Maximum likelihood fitting $\rightarrow$ find a set of model parameters + which most likely describe the data by maximizing the probability + distributions. + +The parameter determination by minimization is an integral part of machine +learning approaches, here a system learns patterns and predicts +related ones. This is the focus in the upcoming days. + + + +## Least Square (LS) Method (1) + +The method determines the \textcolor{blue}{optimal parameters of functions + to gaussian distributed measurements}. + +Lets consider a sample of $n$ measurements $y_{i}$ and a parametrized +description of the measurement $\eta_{i} = f(x_{i} | \theta)$ +with a parameter set $\theta = \theta_{1}, \theta_{2} ,.... \theta_{k}$, +dependent values $x_{i}$ and measurement errors $\sigma_{i}$. + +The parameter set should be determined such that +\begin{equation*} + \color{blue}{S = \sum \limits_{i=1}^{n} \frac{(y_i-\eta_i)^2}{\sigma_i^2} = \sum \limits_{i=1}^{n} \frac{(y_i- f(x_i|\theta))^2}{\sigma_i^2} \longrightarrow \, minimal } +\end{equation*} +In case of correlated measurements the covariance matrix of the $y_{i}$ has to +be taken into account. This is accomplished by defining a weight matrix from +the covariance matrix of the input data. A decorrelation of the input data +should be considered. +\vspace{0.2cm} + +$S$ follows a $\chi^{2}$-distribution with $(n-k)$ degrees of freedom. + +## Least Square (LS) Method (2) + +\setbeamertemplate{itemize item}{\color{red}\tiny$\blacksquare$} +* Example LS-method + \vspace{0.2cm} + + Often the fit function $f(x, \theta)$ is linear in + $\theta = \theta_{1}, \theta_{2} ,.... \theta_{k}$ + \vspace{0.2cm} + + $f(x | \theta) = \theta_{1} f_{1}(x) + .... 
+ \theta_{k} f_{k}(x)$ + \vspace{0.2cm} + + If the model is a straight line and our parameters are $\theta_{1}$ and + $\theta_{2}$ $(f_{1}(x) = 1,$ $f_{2}(x) = x)$ we have + $f(x | \theta) = \theta_{1} + \theta_{2} x$ + \vspace{0.2cm} + + The LS equation is + \vspace{0.2cm} + + $\color{blue}{S = \sum \limits_{i=1}^{n} \frac{(y_i-\eta_i)^2}{\sigma_i^2} } \color{black} {= \sum + \limits_{i=1}^{n} \frac{(y_{i} - \theta_{1} - x_{i} + \theta_{2})^2}{\sigma_i^2 }}$ \hspace{0.4cm} and with + \vspace{0.2cm} + + $\frac{\partial S}{\partial \theta_1} = \sum\limits_{i=1}^{n} \frac{-2 + (y_i - \theta_1 - x_i \theta_2)}{\sigma_i^2} = 0$ \hspace{0.4cm} and \hspace{0.4cm} + $\frac{\partial S}{\partial \theta_2} = \sum\limits_{i=1}^{n} \frac{-2 x_i (y_i - \theta_1 - x_i \theta_2)}{\sigma_i^2} = 0$ + \vspace{0.2cm} + + the parameters $\theta_{1}$ and $\theta_{2}$ can be determined. + + \vspace{0.2cm} + \textcolor{olive}{In case of linear fit functions solutions can be found by matrix inversion} + + \vfill + +## Least Square (LS) Method (3) + + \setbeamertemplate{itemize item}{\color{red}\tiny$\blacksquare$} + +* Use of a nonlinear fit function $f(x, \theta)$ like \hspace{0.4cm} + $f(x | \theta) = \theta_{1} \cdot e^{-\theta_{2} x}$ + \vspace{0.2cm} + + results in the LS equation + \vspace{0.2cm} + + $\color{blue}{S = \sum \limits_{i=1}^{n} \frac{(y_i-\eta_i)^2}{\sigma_i^2} } \color{black} {= \sum \limits_{i=1}^{n} \frac{(y_{i} - \theta_{1} \cdot e^{-\theta_{2} x_{i}})^2}{\sigma_i^2 }}$ \hspace{0.4cm} + \vspace{0.2cm} + + which we have to minimize + \vspace{0.2cm} + + $\frac{\partial S}{\partial \theta_1} = \sum\limits_{i=1}^{n} \frac{ 2 e^{-2 \theta_2 x_i} ( \theta_1 - y_i e^{\theta_2 x_i} )} {\sigma_i^2 } = 0$ \hspace{0.4cm} and \hspace{0.4cm} + $\frac{\partial S}{\partial \theta_2} = \sum\limits_{i=1}^{n} \frac{ 2 \theta_1 x_I e^{-2 \theta_2 x_i} (y_i e^{\theta_2 x_i} - \theta_1)} {\sigma_i^2 } = 0$ + + \vspace{0.4cm} + + In a nonlinear system, the LS Ansatz leads to derivatives which are + functions of the independent variable and the parameters $\color{red}\rightarrow$ \textcolor{olive}{no closed solutions} + \vspace{0.4cm} + + In general, we have gradient equations which don't have closed solutions. + There are a couple of methods including approximations which allow together + with numerical methods to find a global minimum, Gauss–Newton algorithm, + Levenberg–Marquardt algorithm, gradient descend methods and also direct + search methods. + +## Minuit - a programm package for minimization (1) + +In general data fitting and also solving machine learning algorithms lead +to a minimization problem of functions. In the +1975-1980 F. James (CERN) developed +a FORTRAN-based package, [\textcolor{violet}{MINUIT}](http://seal.web.cern.ch/seal/documents/minuit/mntutorial.pdf), which is a framework to handle +multiparameter minimization and compute the best-fit parameter values and +uncertainties, including correlations between the parameters. +\vspace{0.2cm} + +The user provides a minimization function +$F(X,P)$ with the parameter space $P=(p_1,....p_k)$ and +variable space $X$ (also multi-dimensional). There is an interface via +functions which influences the +the minimization process. MINUIT provides +[\textcolor{violet}{error calculations}](http://seal.web.cern.ch/seal/documents/minuit/mnerror.pdf) including correlations for the parameter space by evaluating the shape of the function in some neighbourhood of the minimum. 
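
\vspace{0.2cm}

A toy illustration (made-up numbers, independent of MINUIT) of how the shape of the function near the minimum yields a parameter uncertainty: for a least-squares function, $\Delta S = 1$ defines the $1\sigma$ interval, i.e. $\sigma^2 = 2/S''(\hat\theta)$.

\footnotesize
```python
import numpy as np
x, y, dy = np.array([1., 2., 3.]), np.array([2.1, 3.9, 6.2]), 0.2
def S(theta):                       # chi^2 for the toy model y = theta * x
    return np.sum((y - theta * x)**2 / dy**2)
theta_min = np.sum(x * y) / np.sum(x**2)       # analytic minimum of the linear problem
h = 1e-4                                       # finite-difference step
d2S = (S(theta_min + h) - 2*S(theta_min) + S(theta_min - h)) / h**2
sigma = np.sqrt(2. / d2S)           # 1 sigma uncertainty from the curvature at the minimum
print(theta_min, sigma)
```
\normalsize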
+\vspace{0.2cm} + +The package +has now a new object-oriented implementation as [\textcolor{violet}{Minuit2 library}](https://root.cern.ch/doc/master/Minuit2Page.html) , written +in C++. +\vspace{0.2cm} + +During the minimization $F(X,P)$ is evaluated for various $X$. For the +choice of $P=(p_1,....p_k)$ different methods are used + +## Minuit - a programm package for minimization (2) + +\vspace{0.4cm} +\textcolor{olive}{SEEK}: Search for the minimum with Monte Carlo methods, mostly used at the start + of the minimization with unknown starting values. It is not a converging + algorithm. + \vspace{0.2cm} + +\textcolor{olive}{SIMPLX}: + Uses the simplex method of Nelder and Mead. Function values are compared + in the parameter space. Via step size control the minimum is approached. + Parameter errors are only approximate, no covariance matrix is calculated. +\vspace{0.2cm} + + + +\textcolor{olive}{MIGRAD}: + Uses an algorithm of R. Fletcher, which takes the function and the gradient + to approach the minimum with a variable metric method. An error matrix and + correlation coefficients are available + \vspace{0.2cm} + +\textcolor{olive}{HESSE}: + Calculates the hessian matrix of second derivatives and determines the + covariance matrix. + \vspace{0.2cm} + +\textcolor{olive}{MINOS}: + Calculates (asymmetric) errors using likelihood profiles. + The algorithm for finding the positive and negative MINOS errors for parameter + $n$ consists of varying $n$ each time minimizing $F(X,P)$ with respect to + all the others. + \vspace{0.2cm} + +## Minuit - a programm package for minimization (3) + +\vspace{0.4cm} + +Fit process with the minuit package +\vspace{0.2cm} + +\setbeamertemplate{itemize item}{\color{red}\tiny$\blacksquare$} + +* The individual steps decribed above can be called several times and in different order during the minimization process. + +* Each of the parameters $p_i$ of $P=(p_1,....p_k)$ can be set constant and + released during the minimization steps. + +* Problems are expected in models with strong correlation between + parameters $\rightarrow$ change model to uncorrelated definitions + +* Local minima, edges/steps or undefined ranges in $F(X,P)$ are problematic + $\rightarrow$ simplify your model + + \vspace{3cm} + + +## Minuit2 - The iminuit package + +\vspace{0.4cm} + + [\textcolor{violet}{iminuit}](https://iminuit.readthedocs.io/en/stable/) is + a Jupyter-friendly Python interface for the Minuit2 C++ library. +\vspace{0.2cm} + + \setbeamertemplate{itemize item}{\color{red}\tiny$\blacksquare$} + +* The class `iminuit.Minuit` instanciates the minuit object. The minimizer + function is given as argument. Basic steering of the fit + like setting start parameters, error definition and print level is also + done here. + +\footnotesize +```python + from iminuit import Minuit + def fcn(x, y, z): # definition of the minimizer function + return (x - 2) ** 2 + (y - x) ** 2 + (z - 4) ** 2 + m = Minuit(fcn, x=0, y=0, z=0, errordef=1 , print_level=1) +``` +\normalsize + + * Several methods determine the interaction with the fitting process, calls + to `migrad` , `hesse` or printing of parameters and errors + +\footnotesize +```python + ...... 
+ m.migrad() # run optimiser + print(m.values , m.errors) # print results + m.hesse() # run covariance estimator +``` +\normalsize + +## Minuit2 - iminuit example + +\vspace{0.2cm} + +\setbeamertemplate{itemize item}{\color{red}\tiny$\blacksquare$} + + * The function `fcn` describes the model with parameters to be determined by + data.`fcn` is minimal when the model parameters agree best with data. + `fcn` has positional arguments, one for each fit parameter. `iminuit` + example fit: + + [\textcolor{violet}{02\_fit\_exp\_fit\_iMinuit.py}](https://www.physi.uni-heidelberg.de/~reygers/lectures/2021/ml/examples/02_fit_exp_fit_iMinuit.py) + +\footnotesize +```python + ...... + x = np.array([....],dtype='d') # measurements x + y = np.array([....],dtype='d') # measurements y + dy = np.array([....],dtype='d') # error in y + def xp(a, b , c): + return a * np.exp(b*x) + c + # least-squares function = sum of data residuals squared + def fcn(a,b,c): + return np.sum((y - xp(a,b,c)) ** 2 / dy ** 2) + # limit the range of b and fix parameter c + m = Minuit(fcn,a=1,b=-0.7,c=1,limit_b=(-1,0.1),fix_c=True) + m.migrad() # run minimizer + m.fixed["c"] = False # release parameter c + m.migrad() # rerun minimizer +``` +\normalsize + + * Might be useful to fix parameters or limit the range for some applications + +## Minuit2 - iminuit (3) + +\vspace{0.2cm} + +\setbeamertemplate{itemize item}{\color{red}\tiny$\blacksquare$} + +* Results and control information of the fit can be printed and accessed + in the the prorgamm. + +\footnotesize +```python + ...... + m = Minuit(fcn,....,print_level=1) # set flag in the initializer + m.migrad() # run minimizer + a_fit = m.values['a'] # get parameter value a + a_fit_error = m.errors['a'] # get parameter error of a + print (m.values,m.errors) # print results + ``` +\normalsize + +* After processing Hesse, covariance and correlation information of the + fit is available + +\footnotesize +```python + ...... + m.hesse() # run covariance estimator + m.matrix() # get covariance matrix + m.matrix(correlation=True) # get full correlation matrix + cov = m.np_matrix() # save matrix to numpy + cor = m.np_matrix(correlation=True) + print(cor[0, 1]) # print correlation between parameter 1 and 2 + ``` +\normalsize + +## Minuit2 - iminuit (4) + +\setbeamertemplate{itemize item}{\color{red}\tiny$\blacksquare$} + + * Minos provides asymmetric uncertainty intervals and parameter contours by + scanning one parameter and minimizing the function with respect to all other + parameters for each scan point. Results are displayed with `matplotlib`. + +\footnotesize +```python + ...... + m.minos() + print (m.get_merrors()['a']) + m.draw_mnprofile('b') + m.draw_mncontour('a', 'b', nsigma=4) +``` +::: columns +:::: {.column width=40%} +![](figures/iminuit_minos_scan-1.png) +:::: +:::: {.column width=40%} +![](figures/iminuit_minos_scan-2.png) +:::: +::: + +## Exercise 3 + +Plot the following data with mathplotlib as in the iminuit example: + + \footnotesize +``` + x: 0.2,0.4,0.6,0.8,1.,1.2,1.4,1.6,1.8,2.,2.2,2.4,2.6,2.8,3.,3.2, + 3.4,3.6, 3.8,4. 
+ y: 0.04,0.021,0.035,0.03,0.029,0.019,0.024,0.018,0.019,0.022,0.02, + 0.025,0.018,0.024,0.019,0.021,0.03,0.019,0.03,0.024 + dy: 1.792,1.695,1.541,1.514,1.427,1.399,1.388,1.270,1.262,1.228,1.189, + 1.182,1.121,1.129,1.124,1.089,1.092,1.084,1.058,1.057 +``` +\normalsize + \setbeamertemplate{itemize item}{\color{red}$\square$} + +* Exchange in the example iminuit fit `02_fit_exp_fit_iMinuit.ipynb` the + exponential function by a 3rd order polynomial and perform the fit + +* Compare the correlation of the parameters of the exponential and + the polynomial fit + +* What defines the fit quality, give an estimate + + \small + Solution: [\textcolor{violet}{02\_fit\_ex\_3\_sol.py}](https://www.physi.uni-heidelberg.de/~reygers/lectures/2021/ml/solutions/02_fit_ex_3_sol.py) \normalsize + +## Exercise 4 + +Plot the following data with mathplotlib: + + \footnotesize +``` + x: 1, 2, 3, 4, 5, 6, 7, 8, 9, 10 + dx: 0.1,0.1,0.5,0.1,0.5,0.1,0.5,0.1,0.5,0.1 + y: 1.1,2.3,2.7,3.2,3.1,2.4,1.7,1.5,1.5,1.7 + dy: 0.15,0.22,0.29,0.39,0.31,0.21,0.13,0.15,0.19,0.13 +``` +\normalsize + \setbeamertemplate{itemize item}{\color{red}$\square$} + + * Perform a fit with iminuit. Which model do you use? + + * Plot the resulting fit function in the graph with the data + + * Print the covariance matrix. Can we improve the errors. + + * Can you draw a contour plot of 2 of the fit parameters. + + \small + Solution: [\textcolor{violet}{02\_fit\_ex\_4\_sol.py}](https://www.physi.uni-heidelberg.de/~reygers/lectures/2021/ml/solutions/02_fit_ex_4_sol.py) \normalsize + + +## PyROOT + +[\textcolor{violet}{PyROOT}](https://root.cern/manual/python/) is the python binding for the C++ data analysis toolkit [\textcolor{violet}{ROOT}](https://root.cern/) developed with and for the LHC community. You can access the full +ROOT functionality from Python while +benefiting from the performance of the ROOT C++ libraries. The PyROOT bindings +are automatic and dynamic and are able to interoperate with widely-used Python +data-science libraries as `NumPy`, `pandas`, SciPy `scikit-learn` and `tensorflow`. + +* ROOT/PyROOT can be installed easily within anaconda3 (ROOT version 6.22.02 + or later ) or is available in the + [\textcolor{violet}{CIP jupyter2 Hub}](https://jupyter2.kip.uni-heidelberg.de/) + +* Tools for statistical analysis, a math library with optimized algorithms, + multivariate analysis, visualization and simulation of data. + +* Storing data including objects and classes with compression in files is a + very powerfull aspect for any data analysis project + +* Within PyROOT Minuit2 can be accessed easily either with predefined functions + or your own function definition + +* For advanced statistical analyses and data modeling likelihood fitting with + the packages **rooFit** and **rooStats** is available. + + +## + +* Example reading the invariant mass measurements of a $D^0$ from a text file + and determine $\mu$ and $\sigma$ \hspace{1.0cm} \small + [\textcolor{violet}{02\_fit\_histFit.py}](https://www.physi.uni-heidelberg.de/~reygers/lectures/2021/ml/examples/02_fit_histFit.py) + \normalsize + +\footnotesize +```python + import numpy as np + import math + from ROOT import TCanvas, TFile, TH1D, TF1, TMinuit, TFitResult + data = np.genfromtxt('D0Mass.txt', dtype='d') # read data from text file + c = TCanvas('c','D0 Mass',200,10,700,500) # instanciate output canvas + d0 = TH1D('d0','D0 Mass',200,1700.,2000.) 
# instanciate histogramm + for x in data : # fill data into histogramm d0 + d0.Fill(x) + def pyf_tf1_params(x, p): # define fit function + return p[0] * math.exp (-0.5 * ((x[0] - p[1])**2 / p[2]**2)) + func = TF1("func",pyf_tf1_params,1840.,1880.,3) + # func = TF1("func",'gaus',1840.,1880.) # use predefined function + func.SetParameters(500.,1860.,5.5) # set start parameters + myfit = d0.Fit(func,"S") # fit function to the histogramm data + print ("Fit results: mean=",myfit.Parameter(0)," +/- ",myfit.ParError(0)) + c.Draw() # draw canvas + myfile = TFile('myOutFile.root','RECREATE') # Open a ROOT file for output + c.Write() # Write canvas + d0.Write() # Write histogram + myfile.Close() # close file +``` +\normalsize + + +## + +* Fit Options +\vspace{0.1cm} + +::: columns +:::: {.column width=2%} +:::: +:::: {.column width=98%} +![](figures/rootOptions.png) +:::: +::: + +## Exercise 5 + + Read text file [\textcolor{violet}{FitTestData.txt}](https://www.physi.uni-heidelberg.de/~reygers/lectures/2021/ml/exercises/FitTestData.txt) and draw a histogramm using PyROOT. + \setbeamertemplate{itemize item}{\color{red}$\square$} + +* Determine the mean and sigma of the signal distribution. Which function do + you use for fitting? + +* The option S fills the result object. + +* Try to improve the errors of the fit values with minos using the option E + and also try the option M to scan for a new minimum, option V provides more + output. + +* Fit the background outside the signal region use the option R+ to add the + function to your fit + + \small + Solution: [\textcolor{violet}{02\_fit\_ex\_5\_sol.py}](https://www.physi.uni-heidelberg.de/~reygers/lectures/2021/ml/solutions/02_fit_ex_5_sol.py) \normalsize + + +## iPython Examples for Fitting + + The different python packages are used in + \textcolor{blue}{example iPython notebooks} + to demonstrate the fitting of a third order polynomial to the same data + available as numpy arrays. + + \setbeamertemplate{itemize item}{\color{red}\tiny$\blacksquare$} + + * LSQ fit of a polynomial to data using Minuit2 with + \textcolor{blue}{iminuit} and \textcolor{blue}{matplotlib} plot: + + \small + [\textcolor{violet}{02\_fit\_iminuitFit.ipynb}](https://www.physi.uni-heidelberg.de/~reygers/lectures/2021/ml/examples/02_fit_iminuitFit.ipynb) + \normalsize + + * Graph fitting with \textcolor{blue}{pyROOT} with options using a python + function including confidence level plot: + + \small + [\textcolor{violet}{02\_fit\_fitGraph.ipynb}](https://www.physi.uni-heidelberg.de/~reygers/lectures/2021/ml/examples/02_fit_fitGraph.ipynb) + \normalsize + + * Graph fitting with \textcolor{blue}{numpy} and confidence level + plotting with \textcolor{blue}{matplotlib}: + + \small + [\textcolor{violet}{02\_fit\_numpyFit.ipynb}](https://www.physi.uni-heidelberg.de/~reygers/lectures/2021/ml/examples/02_fit_numpyFit.ipynb) + \normalsize + + * Graph fitting with a polynomial fit of \textcolor{blue}{scikit-learn} and + plotting with \textcolor{blue}{matplotlib}: + + \normalsize + \small + [\textcolor{violet}{02\_fit\_scikitFit.ipynb}](https://www.physi.uni-heidelberg.de/~reygers/lectures/2021/ml/examples/02_fit_scikitFit.ipynb) + \normalsize diff --git a/slides/intro_python.md b/slides/intro_python.md new file mode 100644 index 0000000..c214823 --- /dev/null +++ b/slides/intro_python.md @@ -0,0 +1,830 @@ +--- +title: | + | Introduction to Data Analysis and Machine Learning in Physics: + | 1. 
Introduction to python + +author: "Martino Borsato, Jörg Marks, Klaus Reygers" +date: "Studierendentage, 11-14 April 2022" +--- + +## Outline of the $1^{st}$ day + +* Technical instructions for your interactions with the CIP pool, like + * using the jupyter hub + * using python locally in your own linux environment (anaconda) + * access the CIP pool from your own windows or linux system + * transfer data from and to the CIP pool + + Can be found in [\textcolor{violet}{CIPpoolAccess.PDF}](https://www.physi.uni-heidelberg.de/~marks/root_einfuehrung/Folien/CIPpoolAccess.pdf)\normalsize + +* Summary of NumPy + +* Plotting with matplotlib + +* Input / output of data + +* Summary of pandas + +* Fitting with iminuit and pyROOT + + +## A glimpse into python classes + + The following python classes are important to data analysis and machine + learning will be used during the course + + * [\textcolor{violet}{NumPy}](https://numpy.org/doc/stable/user/basics.html) - python library adding support for large, + multi-dimensional arrays and matrices, along with high-level + mathematical functions to operate on these arrays + + * [\textcolor{violet}{matplotlib}](https://matplotlib.org/stable/tutorials/index.html) - a python plotting library + + * [\textcolor{violet}{SciPy}](https://docs.scipy.org/doc/scipy/reference/tutorial/index.html) - extension of NumPy by a collection of + mathematical algorithms for minimization, regression, + fourier transformation, linear algebra and image processing + + * [\textcolor{violet}{iminuit}](https://iminuit.readthedocs.io/en/stable/) - + python wrapper to the data fitting toolkit + [\textcolor{violet}{Minuit2}](https://root.cern.ch/doc/master/Minuit2Page.html) + developed at CERN by F. James in the 1970ies + + * [\textcolor{violet}{pyROOT}](https://root.cern/manual/python/) - python wrapper to the C++ data analysis toolkit + ROOT used at the LHC + + * [\textcolor{violet}{scikit-learn}](https://scikit-learn.org/stable/) - machine learning library written in + python, which makes use extensively of NumPy for high-performance + linear algebra algorithms + +## NumPy + + \textcolor{blue}{NumPy} (Numerical Python) is an open source Python library, + which contains multidimensional array and matrix data structures and methods + to efficiently operate on these. The core object is + a homogeneous n-dimensional array object, \textcolor{blue}{ndarray}, which + allows for a wide variety of \textcolor{blue}{fast operations and mathematical calculations + with arrays and matrices} due to the extensive usage of compiled code. + + * It is heavily used in numerous scientific python packages + + * `ndarray` 's have a fixed size at creation $\rightarrow$ changing size + leads to recreation + + * Array elements are all required to be of the same data type + + * Facilitates advanced mathematical operations on large datasets + + * See for a summary, e.g.    + \small +[\textcolor{violet}{https://cs231n.github.io/python-numpy-tutorial/\#numpy}](https://cs231n.github.io/python-numpy-tutorial/#numpy) \normalsize + +\vfill + +::: columns +:::: {.column width=30%} + +:::: +::: + +::: columns +:::: {.column width=35%} + +`c = []` + +`for i in range(len(a)):` + +    `c.append(a[i]*b[i])` + +:::: + +:::: {.column width=35%} + +with NumPy + +`c = a * b` + +:::: +::: + + + +## NumPy - array basics + +* numpy arrays build a grid of \textcolor{blue}{same type} values, which are indexed. + The *rank* is the dimension of the array. + There are methods to create and preset arrays. 
+ +\footnotesize + +```python + myA = np.array([2, 5 , 11]) # create rank 1 array (vector like) + type(myA) # + myA.shape # (3,) + print(myA[2]) # 11 access 3. element + myA[0] = 12 # set 1. element to 12 + myB = np.array([[1,5],[7,9]]) # create rank 2 array + myB.shape # (2,2) + print(myB[0,0],myB[0,1],myB[1,1]) # 1 5 9 + myC = np.arange(6) # create rank 1 set to 0 - 5 + myC.reshape(2,3) # change rank to (2,3) + + zero = np.zeros((2,5)) # 2 rows, 5 columns, set to 0 + one = np.ones((2,2)) # 2 rows, 2 columns, set to 1 + five = np.full((2,2), 5) # 2 rows, 2 columns, set to 5 + e = np.eye(2) # create 2x2 identity matrix +``` +\normalsize + + +## NumPy - array indexing (1) + +* select slices of a numpy array + +\footnotesize +```python + a = np.array([[1,2,3,4], + [5,6,7,8], # 3 rows 4 columns array + [9,10,11,12]]) + b = a[:2, 1:3] # subarray of 2 rows and + array([[2, 3], # column 1 and 2 + [6, 7]]) +``` +\normalsize + +* a slice of an array points into the same data, *modifying* changes the original array! + +\footnotesize +```python + b[0, 0] = 77 # b[0,0] and a[0,1] are 77 + + r1_row = a[1, :] # get 2nd row -> rank 1 + r1_row.shape # (4,) + r2_row = a[1:2, :] # get 2nd row -> rank 2 + r2_row.shape # (1,4) + a=np.array([[1,2],[3,4],[5,6]]) # set a , 3 rows 2 cols + d=a[[0, 1, 2], [0, 1, 1]] # d contains [1 4 6] + e=a[[1, 2], [1, 1]] # e contains [4 6] + np.array([a[0,0],a[1,1],a[2,0]]) # address elements explicitly +``` +\normalsize + + +## NumPy - array indexing (2) + + +* integer array indexing by setting an array of indices $\rightarrow$ selecting/changing elements + +\footnotesize +```python + a = np.array([[1,2,3,4], + [5,6,7,8], # 3 rows 4 columns array + [9,10,11,12]]) + p_a = np.array([0,2,0]) # Create an array of indices + s = a[np.arange(3), p_a] # number the rows, p_a points to cols + print (s) # s contains [1 7 9] + a[np.arange(3),p_a] += 10 # add 10 to corresponding elements + x=np.array([[8,2],[7,4]]) # create 2x2 array + bool = (x > 5) # bool : array of boolians + # [[True False] + # [True False]] + print(x[x>5]) # select elements, prints [8 7] +``` +\normalsize + +* data type in numpy - create according to input numbers or set explicitly + +\footnotesize + +```python + x = np.array([1.1, 2.1]) # create float array + print(x.dtype) # print float64 + y=np.array([1.1,2.9],dtype=np.int64) # create float array [1 2] +``` +\normalsize + + +## NumPy - functions + +* math functions operate elementwise either as operator overload or as methods + +\footnotesize +```python + x=np.array([[1,2],[3,4]],dtype=np.float64) # define 2x2 float array + y=np.array([[3,1],[5,1]],dtype=np.float64) # define 2x2 float array + s = x + y # elementwise sum + s = np.add(x,y) + s = np.subtract(x,y) + s = np.multiply(x,y) # no matrix multiplication! + s = np.divide(x,y) + s = np.sqrt(x), np.exp(x), ... 
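+  # (the line above and the line below each list several alternatives on one
+  #  line; they are meant as a quick reference, not to be executed literally)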
+ x @ y , or np.dot(x, y) # matrix product + np.sum(x, axis=0) # sum of each column + np.sum(x, axis=1) # sum of each row + xT = x.T # transpose of x + x = np.linspace(0,2*pi,100) # get equal spaced points in x + + r = np.random.default_rng(seed=42) # constructor random number class + b = r.random((2,3)) # random 2x3 matrix +``` +\normalsize + + + +## + +* broadcasting in numpy + \vspace{0.4cm} + + The term broadcasting describes how numpy treats arrays with different + shapes during arithmetic operations + + * add a scalar $b$ to a 1D array $a = [a_1,a_2,a_3]$ $\rightarrow$ expand $b$ to + $[b,b,b]$ + \vspace{0.2cm} + + * add a scalar $b$ to a 2D [2,3] array $a =[[a_{11},a_{12},a_{13}],[a_{21},a_{22},a_{23}]]$ + $\rightarrow$ expand $b$ to $b =[[b,b,b],[b,b,b]]$ and add element wise + \vspace{0.2cm} + + * add 1D array $b = [b_1,b_2,b_3]$ to a 2D [2,3] array $a=[[a_{11},a_{12},a_{13}],[a_{21},a_{22},a_{23}]]$ $\rightarrow$ 1D array is broadcast + across each row of the 2D array $b =[[b_1,b_2,b_3],[b_1,b_2,b_3]]$ and added element wise + \vspace{0.2cm} + + Arithmetic operations can only be performed when the shape of each + dimension in the arrays are equal or one has the dimension size of 1. Look + [\textcolor{violet}{here}](https://numpy.org/doc/stable/user/basics.broadcasting.html) for more details + +\footnotesize +```python + # Add a vector to each row of a matrix + x = np.array([[1,2,3], [4,5,6]]) # x has shape (2, 3) + v = np.array([1,2,3]) # v has shape (3,) + x + v # [[2 4 6] + # [5 7 9]] +``` +\normalsize + +## Plot data + +A popular library to present data is the `pyplot` module of `matplotlib`. + +* Drawing a function in one plot + +\footnotesize +::: columns +:::: {.column width=35%} +```python +import numpy as np +import matplotlib.pyplot as plt +# generate 100 points from 0 to 2 pi +x = np.linspace( 0, 10*np.pi, 100 ) +f = np.sin(x)**2 +# plot function +plt.plot(x,f,'blueviolet',label='sine') +plt.xlabel('x [radian]') +plt.ylabel('f(x)') +plt.title('Plot sin^2') +plt.legend(loc='upper right') +plt.axis([0,30,-0.1,1.2]) # limit the plot range + +# show the plot +plt.show() +``` +:::: +:::: {.column width=40%} +![](figures/matplotlib_Figure_1.png) +:::: +::: + +\normalsize + +## +* Drawing subplots in one canvas + +\footnotesize +::: columns +:::: {.column width=35%} +```python +... +g = np.exp(-0.2*x) +# create figure +plt.figure(num=2,figsize=(10.0,7.5),dpi=150,facecolor='lightgrey') +plt.suptitle('1 x 2 Plot') +# create subplot and plot first one +plt.subplot(1,2,1) +# plot first one +plt.title('exp(x)') +plt.xlabel('x') +plt.ylabel('g(x)') +plt.plot(x,g,'blueviolet') +# create subplot and plot second one +plt.subplot(1,2,2) +plt.plot(x,f,'orange') +plt.plot(x,f*g,'red') +plt.legend(['sine^2','exp*sine']) +# show the plot +plt.show() +``` +:::: +:::: {.column width=40%} +\vspace{3cm} +![](figures/matplotlib_Figure_2.png) +:::: +::: +\normalsize + +## Image data + +The `image` class of the `matplotlib` library can be used to load the image +to numpy arrays and to render the image. + +* There are 3 common formats for the numpy array + + * (M, N) scalar data used for greyscale images + + * (M, N, 3) for RGB images (each pixel has an array with RGB color attached) + + * (M, N, 4) for RGBA images (each pixel has an array with RGB color + and transparency attached) + + + The method `imread` loads the image into an `ndarray`, which can be + manipulated. 
+ + The method `imshow` renders the image data + + \vspace {2cm} + +## +* Drawing pixel data and images + +\footnotesize +::: columns +:::: {.column width=50%} + +```python +.... +# create data array with pixel postion and RGB color code +width, height = 400, 400 +data = np.zeros((height, width, 3), dtype=np.uint8) +# red patch in the center +data[175:225, 175:225] = [255, 0, 0] +x = np.random.randint(0,width-1,100) +y = np.random.randint(0,height-1,100) +data[x,y]= [0,255,0] # random green pixel +plt.imshow(data) +plt.show() +.... +import matplotlib.image as mpimg +#read image into numpy array +pic = mpimg.imread('picture.jpg') +mod_pic = pic[:,:,0] # grab slice 0 of the colors +plt.imshow(mod_pic) # use default color code also +plt.colorbar() # try cmap='hot' +plt.show() +``` +:::: +:::: {.column width=25%} +![](figures/matplotlib_Figure_3.png) +\vspace{1cm} +![](figures/matplotlib_Figure_4.png) +:::: +::: +\normalsize + + +## Input / output + +For the analysis of measured data efficient input \/ output plays an +important role. In numpy, `ndarrays` can be saved and read in from files. +`load()` and `save()` functions handle numpy binary files (.npy extension) +which contain data, shape, dtype and other information required to +reconstruct the `ndarray` of the disk file. + +\footnotesize +```python + r = np.random.default_rng() # instanciate random number generator + a = r.random((4,3)) # random 4x3 array + np.save('myBinary.npy', a) # write array a to binary file myBinary.npy + b = np.arange(12) + np.savez('myComp.npz', a=a, b=b) # write a and b in compressed binary file + ...... + b = np.load('myBinary.npy') # read content of myBinary.npy into b +``` +\normalsize + +The storage and retrieval of array data in text file format is done +with `savetxt()` and `loadtxt()` methods. Parameter controling delimiter, +line separators, file header and footer can be specified. + +\footnotesize +```python + x = np.array([1,2,3,4,5,6,7]) # create ndarray + np.savetxt('myText.txt',x,fmt='%d') # write array x to text file myText.txt + ..... + y = np.loadtxt('myText.txt',dtype=int) # read content of myText.txt in y +``` +\normalsize + + +## Exercise 1 + +i) Display a numpy array as figure of a blue cross. The size should be 200 + by 200 pixel. Use as array format (M, N, 3), where the first 2 specify + the pixel positions and the last 3 the rbg color from 0:255. + - Draw in addition a red square of arbitrary position into the figure. + - Draw a circle in the center of the figure. Try to create a mask which + selects the inner part of the circle using the indexing. + + \small + [Solution: 01_intro_ex_1a_sol.py](https://www.physi.uni-heidelberg.de/~reygers/lectures/2021/ml/solutions/01_intro_ex_1a_sol.py) \normalsize + +ii) Read data which contains pixels from the binary file horse.py into a + numpy array. Display the data and the following transformations in 4 + subplots: scaling and translation, compression in x and y, rotation + and mirroring. + + \small + [Solution: 01_intro_ex_1b_sol.py](https://www.physi.uni-heidelberg.de/~reygers/lectures/2021/ml/solutions/01_intro_ex_1b_sol.py) \normalsize + + +## Pandas + +[\textcolor{violet}{pandas}](https://pandas.pydata.org/pandas-docs/stable/getting_started/index.html) is a software library written in Python for +\textcolor{blue}{data manipulation and analysis}. 
+ + \vspace{0.4cm} + +\setbeamertemplate{itemize item}{\color{red}\tiny$\blacksquare$} + +* Offers data structures and operations for manipulating numerical tables with + integrated indexing + +* Imports data from various file formats, e.g. comma-separated values, JSON, + SQL or Excel + +* Tools for reading and writing data structures, allows analyzing, filtering, + spliting, merging and joining + +* Built on top of `NumPy` + +* Visualize the data with `matplotlib` + +* Most machine learning tools support `pandas` $\rightarrow$ + it is widely used to preprocess data sets for machine learning + +## Pandas micro introduction + +Goal: Exploring, cleaning, transforming, and visualization of data. +The basic indexable objects are + +\setbeamertemplate{itemize item}{\color{red}\tiny$\blacksquare$} + +* `Series` -> vector (list) of data elements of arbitrary type + +* `DataFrame` -> tabular arangement of data elements of column wise + arbitrary type + + Both allow cleaning data by removing of `empty` or `nan` data entries + +\footnotesize +```python + import numpy as np + import pandas as pd # use together with numpy + s = pd.Series([1, 3, 5, np.nan, 6, 8]) # create a Series of float64 + r = pd.Series(np.random.randn(4)) # Series of random numbers float64 + dates = pd.date_range("20130101", periods=3) # index according to dates + df = pd.DataFrame(np.random.randn(3,4),index=dates,columns=list("ABCD")) + print (df) # print the DataFrame + A B C D + 2013-01-01 1.618395 1.210263 -1.276586 -0.775545 + 2013-01-02 0.676783 -0.754161 -1.148029 -0.244821 + 2013-01-03 -0.359081 0.296019 1.541571 0.235337 + + new_s = s.dropna() # return a new Data Frame with no empty cells +``` +\normalsize + +## + +\setbeamertemplate{itemize item}{\color{red}\tiny$\blacksquare$} + +* pandas data can be saved in different file formats (CSV, JASON, html, XML, + Excel, OpenDocument, HDF5 format, .....). `NaN` entries are kept + in the output file. + + * csv file + \footnotesize + ```python + df.to_csv("myFile.csv") # Write the DataFrame df to a csv file + ``` + \normalsize + + * HDF5 output + + \footnotesize + ```python + df.to_hdf("myFile.h5",key='df',mode='w') # Write the DataFrame df to HDF5 + s.to_hdf("myFile.h5", key='s',mode='a') + ``` + \normalsize + + * Writing to an excel file + + \footnotesize + ```python + df.to_excel("myFile.xlsx", sheet_name="Sheet1") + ``` + \normalsize + +* Deleting file with data in python + +\footnotesize +```python + import os + os.remove('myFile.h5') +``` +\normalsize + +## + +\setbeamertemplate{itemize item}{\color{red}\tiny$\blacksquare$} + +* read in data from various formats + + * csv file + + \footnotesize + + ```python + ....... 
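+       # ('.......' stands for the omitted setup, e.g. import pandas as pd;
+       #  heart.csv is assumed to be in the current working directory)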
+ df = pd.read_csv('heart.csv') # read csv data table + print(df.info()) + + RangeIndex: 303 entries, 0 to 302 + Data columns (total 14 columns): + # Column Non-Null Count Dtype + --- ------ -------------- ----- + 0 age 303 non-null int64 + 1 sex 303 non-null int64 + 2 cp 303 non-null int64 + print(df.head(5)) # prints the first 5 rows of the data table + print(df.describe()) # shows a quick statistic summary of your data + ``` +\normalsize + + * Reading an excel file + + \footnotesize + ```python + df = pd.read_excel("myFile.xlsx","Sheet1", na_values=["NA"]) + ``` + \normalsize + + \textcolor{olive}{There are many options specifying details for IO.} + +## + +\setbeamertemplate{itemize item}{\color{red}\tiny$\blacksquare$} + +* Various functions exist to select and view data from pandas objects + + * Display column and index + + \footnotesize + + ```python + df.index # show datetime index of df + DatetimeIndex(['2013-01-01','2013-01-02','2013-01-03'], + dtype='datetime64[ns]',freq='D') + df.column # show columns info + Index(['A', 'B', 'C', 'D'], dtype='object') + ``` + \normalsize + + * `DataFrame.to_numpy()` gives a `NumPy` representation of the underlying data + + \footnotesize + + ```python + df.to_numpy() # one dtype for the entire array, not per column! + [[-0.62660101 -0.67330526 0.23269168 -0.67403546] + [-0.53033339 0.32872063 -0.09893568 0.44814084] + [-0.60289996 -0.22352548 -0.43393248 0.47531456]] + ``` + \normalsize + + Does not include the index or column labels in the output + + * more on viewing + + \footnotesize + + ```python + df.T # transpose the DataFrame df + df.sort_values(by="B") # Sorting by values of a column of df + df.sort_index(axis=0,ascending=False) # Sorting by index descending values + df.sort_index(axis=0,ascending=False) # Display columns in inverse order + + ``` + \normalsize + +## + +\setbeamertemplate{itemize item}{\color{red}\tiny$\blacksquare$} + +* Selecting data of pandas objects $\rightarrow$ keep or reduce dimensions + + * get a named column as a Series + + \footnotesize + + ```python + df["A"] # selects a column A from df, simular to df.A + df.iloc[:, 1:2] # slices column A explicitly from df, df.loc[:, ["A"]] + ``` + \normalsize + + * select rows of a DataFrame + + \footnotesize + + ```python + df[0:2] # selects row 0 and 1 from df, + df["20130102":"20130103"] # use indices endpoint are included! 
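+      # note: label-based slices like the one above include both endpoints,
+      # while integer-position slices with iloc (below) exclude the stop value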
+ df.iloc[3] # Select with the position of the passed integers + df.iloc[1:3, :] # selects row 1 and 2 from df + ``` + \normalsize + + * select by label + + \footnotesize + + ```python + df.loc["20130102":"20130103",["C","D"]] # selects row 1 and 2 and only C and D + df.loc[dates[0], "A"] # selects a single value (scalar) + ``` + \normalsize + + * select by lists of integer position (as in `NumPy`) + + \footnotesize + + ```python + df.iloc[[0, 2], [1, 3]] # select row 1 and 3 and col B and D + df.iloc[1, 1] # get a value explicitly + + ``` + \normalsize + + * select according to expressions + + \footnotesize + + ```python + df.query('B 0] # select df where all values of column A are >0 + df[df > 0] # select values from the entire DataFrame + ``` + \normalsize + + more complex example + + \footnotesize + + ```python + df2 = df.copy() # copy df + df2["E"] = ["eight","one","four"] # add column E + df2[df2["E"].isin(["two", "four"])] # test if elements "two" and "four" are + # contained in Series column E + ``` + \normalsize + + * Operations (in general exclude missing data) + + \footnotesize + + ```python + df2[df2 > 0] = -df2 # All elements > 0 change sign + df.mean(0) # get column wise mean (numbers=axis) + df.mean(1) # get row wise mean + df.std(0) # standard deviation according to axis + df.cumsum() # cumulative sum of each column + df.apply(np.sin) # apply function to each element of df + df.apply(lambda x: x.max() - x.min()) # apply lambda function column wise + df + 10 # add scalar 10 + df - [1, 2, 10 , 100] # subtract values of each column + df.corr() # Compute pairwise correlation of columns + ``` + \normalsize + + +## Pandas - plotting data + +[\textcolor{violet}{Visualization}](https://pandas.pydata.org/pandas-docs/stable/user_guide/visualization.html) is integrated in pandas using mathplotlib. Here are only 2 examples + +* Plot random data in histogramm and scatter plot + +\footnotesize +```python + # create DataFrame with random normal distributed data + df = pd.DataFrame(np.random.randn(1000,4),columns=["a","b","c","d"]) + df = df + [1, 3, 8 , 10] # shift mean to 1, 3, 8 , 10 + plt.figure() + df.plot.hist(bins=20) # histogram all 4 columns + g1 = df.plot.scatter(x="a",y="c",color="DarkBlue",label="Group 1") + df.plot.scatter(x="b",y="d",color="DarkGreen",label="Group 2",ax=g1) +``` +\normalsize + +::: columns +:::: {.column width=35%} +![](figures/pandas_histogramm.png) +:::: +:::: {.column width=35%} +![](figures/pandas_scatterplot.png) +:::: +::: + +## Pandas - plotting data + +The function crosstab() takes one or more array-like objects as indexes or +columns and constructs a new DataFrame of variable counts on the inputs + +\footnotesize +```python + df = pd.DataFrame( # create DataFrame of 2 categories + {"sex": np.array([0,0,0,0,1,1,1,1,0,0,0]), + "heart": np.array([1,1,1,0,1,1,1,0,0,0,1]) + } ) # closing bracket goes on next line + pd.crosstab(df2.sex,df2.heart) # create cross table of possibilities + pd.crosstab(df2.sex,df2.heart).plot(kind="bar",color=['red','blue']) # plot counts +``` +\normalsize +::: columns +:::: {.column width=42%} +![](figures/pandas_crosstabplot.png) +:::: +::: + +## Exercise 2 + +Read the file [\textcolor{violet}{heart.csv}](https://www.physi.uni-heidelberg.de/~reygers/lectures/2021/ml/exercises/heart.csv) into a DataFrame. 
+[\textcolor{violet}{Information on the dataset}](https://archive.ics.uci.edu/ml/datasets/heart+Disease) + +\setbeamertemplate{itemize item}{\color{red}$\square$} + + * Which columns do we have + + * Print the first 3 rows + + * Print the statistics summary and the correlations + + * Print mean values for each column with and without disease + + * Select the data according to `sex` and `target` (heart disease 0=no 1=yes). + + * Plot the `age` distribution of male and female in one histogram + + * Plot the heart disease distribution according to chest pain type `cp` + + * Plot `thalach` according to `target` in one histogramm + + * Plot `sex` and `target` in a histogramm figure + + * Correlate `age` and `max heart rate` according to `target` + + * Correlate `age` and `colesterol` according to `target` + + \small + [Solution: 01_intro_ex_2_sol.py](https://www.physi.uni-heidelberg.de/~reygers/lectures/2021/ml/solutions/01_intro_ex_2_sol.py) \normalsize + + + + + + + diff --git a/slides/ml_basics.md b/slides/ml_basics.md new file mode 100644 index 0000000..d7bef87 --- /dev/null +++ b/slides/ml_basics.md @@ -0,0 +1,1157 @@ +--- +title: | + | Introduction to Data Analysis and Machine Learning in Physics: + | 3. Machine Learning Basics + +author: "Martino Borsato, Jörg Marks, Klaus Reygers" +date: "Studierendentage, 11-14 April 2022" +--- + +## Exercises + +* Exercise 1: Air shower classification (MAGIC telescope) + * Logistic regression + * [`03_ml_basics_ex01_magic.ipynb`](https://nbviewer.jupyter.org/urls/www.physi.uni-heidelberg.de/~reygers/lectures/2022/ml/exercises/03_ml_basics_ex_1_magic.ipynb) +* Exercise 2: Hand-written digit recognition with logistic regression + * Logistic regression + * [`03_ml_basics_ex02_mnist_softmax_regression.ipynb`](https://nbviewer.jupyter.org/urls/www.physi.uni-heidelberg.de/~reygers/lectures/2022/ml/exercises/03_ml_basics_ex_2_mnist_softmax_regression.ipynb) +* Exercise 3: Data preprocessing + +## What is machine learning? (1) +![](figures/deepl.png) + +## What is machine learning? (2) +"Machine learning is the subfield of computer science that gives computers the ability to learn without being explicitly programmed" -- Wikipedia + +\vspace{2ex} +Example: spam detection \hfill +\scriptsize [\textcolor{gray}{J. Mayes, Machine learning 101}](https://docs.google.com/presentation/d/1kSuQyW5DTnkVaZEjGYCkfOxvzCqGEFzWBy4e9Uedd9k/preview?imm_mid=0f9b7e&cmp=em-data-na-na-newsltr_20171213&slide=id.g168a3288f7_0_58) +\normalsize + +\begin{center} +\includegraphics[width=0.9\textwidth]{figures/ml_example_spam.png} +\vspace{2ex} + +Manual feature engineering vs. automatic feature detection +\end{center} + +## AI, ML, and DL +"AI is the study of how to make computers perform things that, at the moment, people do better." +\tiny \textcolor{gray}{Elaine Rich, Artificial intelligence, McGraw-Hill 1983} \normalsize +\vfill +\tiny \hfill \textcolor{gray}{G. Marcus, E. 
Davis, Rebooting AI} \normalsize +\begin{figure} +\centering +%![](figures/ai_ml_dl.pdf){width=70%} +\includegraphics[width=0.7\textwidth]{figures/ai_ml_dl.pdf} +\end{figure} + +\vfill +"deep" in deep learning: artificial neural nets with many neurons and multiple layers of nonlinear processing units for feature extraction + +## Multivariate analysis: An early example from particle physics +::: columns +:::: {.column width=55%} +![](figures/mva.png){width=99%} +:::: +:::: {.column width=45%} +* Signal: $e^+e^- \to W^+W^-$ + * often 4 well separated hadron jets +* Background: $e^+e^- \to qqgg$ + * 4 less well separated hadron jets +* Input variables based on jet structure, event shape, ... none by itself gives much separation. +![](figures/mva_nn.png){width=85%} +\tiny \textcolor{gray}{(Garrido, Juste and Martinez, ALEPH 96-144)} \normalsize +:::: +::: + +## Applications of machine learning in physics + +* Particle physics: Particle identification / classification +* Astronomy: Galaxy morphology classification +* Chemistry and material science: predict properties of new molecules / materials +* Many-body quantum matter: classification of quantum phases + +\vspace{3ex} +\scriptsize [\textcolor{gray}{Machine learning and the physical sciences, arXiv:1903.10563}](https://arxiv.org/abs/1903.10563) \normalsize + +## Some successes and unsolved problems in AI +::: columns +:::: {.column width=50%} +![](figures/ai_history.png){width=85%} + +\tiny \textcolor{gray}{M. Woolridge, The road to conscious machines} \normalsize + +:::: +:::: {.column width=50%} + +Impressive progress in certain fields: + +\small +* Image recognition +* Speech recognition +* Recommendation systems +* Automated translation +* Analysis of medical data +\normalsize +\vfill + +How can we profit from these developments in physics? +:::: +::: + +## The deep learning hype -- why now? +Artificial neural networks are around for decades. Why did deep learning take off after 2012? + +\vspace{5ex} + +* Improved hardware -- graphical processing units [GPUs] +* Large data sets (e.g. images) distributed via the Internet +* Algorithmic advances + + +## Different modeling approaches + +* Simple mathematical representation like linear regression. Favored by statisticians. +* Complex deterministic models based on scientific understanding of the physical process. Favored by physicists. +* Complex algorithms to make predictions that are derived from a huge number of past examples (“machine learning” as developed in the field of computer science). These are often black boxes. +* Regression models that claim to reach causal conclusions. Used by economists. + +\tiny \textcolor{gray}{D. 
Spiegelhalter, The Art of Statistics – Learning from data} \normalsize + + +## Machine learning: The "hello world" problem +::: columns +:::: {.column width=45%} + +Recognition of handwritten digits + +* MNIST database (Modified National Institute of Standards and Technology database) +* 60,000 training images and 10,000 testing images labeled with correct answer +* 28 pixel x 28 pixel +* Algorithms have reached "near-human performance" +* Smallest error rate (2018): 0.18\% + +:::: +:::: {.column width=55%} +![](figures/mnist.png) + +\tiny +[\color{gray}{\texttt{https://en.wikipedia.org/wiki/MNIST\_database}}](https://en.wikipedia.org/wiki/MNIST_database) +\normalsize + +:::: +::: + +## Machine learning: Image recognition +ImageNet database + +* 14 million images, 22,000 categories +* Since 2010, the annual ImageNet Large Scale Visual Recognition Challenge (ILSVRC): 1.4 million images, 1000 categories +* In 2017, 29 of 38 competing teams got less than 5\% wrong + +\begin{figure} +\centering +\includegraphics[width=0.8\textwidth]{figures/imagenet.png} +\end{figure} + +## ImageNet: Large Scale Visual Recognition Challenge + +\begin{figure} +\centering +\includegraphics[width=0.8\textwidth]{figures/imagenet_challenge.png} +\end{figure} + +\vfill + +\scriptsize +\textcolor{gray}{O. Russakovsky et al, arXiv:1409.0575} +\normalsize + +## Adversarial attack + +\begin{figure} +\centering +\includegraphics[width=\textwidth]{figures/adversarial_attack.png} +\end{figure} + +\vspace{3ex} +\scriptsize [\textcolor{gray}{Ian J. Goodfellow, Jonathon Shlens, Christian Szegedy, arXiv:1412.6572v1}](https://arxiv.org/abs/1412.6572v1) \normalsize + +## Types of machine learning +::: columns +:::: {.column width=60%} +Reinforcement learning + +\small +* The machine ("the agent") predicts a scalar reward given once in a while +* Weak feedback +\normalsize + +:::: +:::: {.column width=35%} +\tiny [\textcolor{gray}{LeCun 2018, Power And Limits of Deep Learning}](https://www.youtube.com/watch?v=0tEhw5t6rhc) \normalsize +![](figures/videogame.png) +:::: +::: +\vfill +::: columns +:::: {.column width=60%} + +\vspace{1em} +Supervised learning + +\small +* The machine predicts a category based on labeled training data +* Medium feedback +\normalsize +:::: +:::: {.column width=35%} +![](figures/supervised_learning_car_plane.png) +:::: +::: +\vfill +::: columns +:::: {.column width=60%} + +\vspace{1em} +Unsupervised learning + +\small +* Describe/find hidden structure from "unlabeled" data +* Cluster data in different sub-groups with similar properties +\normalsize +:::: +:::: {.column width=35%} +![](figures/anomaly_detection.png) +:::: +::: + +## Books on machine learning (1) + +::: columns +:::: {.column width=85%} +Ian Goodfellow and Yoshua Bengio and Aaron Courville, \textit{Deep Learning}, free online [http://www.deeplearningbook.org/](http://www.deeplearningbook.org/) + +\vspace{8ex} + +Kevin Murphy, \textit{Probabilistic Machine Learning: An Introduction}, [draft pdf version](https://probml.github.io/pml-book/) + +\vspace{7ex} + +Aurelien Geron, \textit{Hands-On Machine Learning with Scikit-Learn and TensorFlow} + +:::: +:::: {.column width=15%} +![](figures/deep_learning_book.png){width=65%} + +\vspace{3ex} + +![](figures/book-murphy.png){width=65%} + +\vspace{3ex} + +![](figures/hands_on_machine_learning.png){width=65%} + +:::: +::: + +## Books on machine learning (2) + +::: columns +:::: {.column width=85%} +Francois Chollet, \textit{Deep Learning with Python} + +\vspace{10ex} + +Martin Erdmann, Jonas 
Glombitza, Gregor Kasieczka, Uwe Klemradt, \textit{Deep Learning for Physics Research} + +:::: +:::: {.column width=15%} +![](figures/deep_learning_with_python.png){width=65%} + +\vspace{3ex} + +![](figures/book_deep_learning_for_physics_research.png){width=65%} + +:::: +::: + +## Papers + +A high-bias, low-variance introduction to Machine Learning for physicists + +[https://arxiv.org/abs/1803.08823](https://arxiv.org/abs/1803.08823) + +\vspace{3ex} + +Machine learning and the physical sciences + +[https://arxiv.org/abs/1903.10563](https://arxiv.org/abs/1903.10563) + +## Supervised learning in a nutshell +* Supervised Machine Learning requires labeled training data, i.e., a training sample where for each event it is known whether it is a signal or background event. +* Each event is characterized by $n$ observables: $\vec x = (x_1, x_2, ..., x_n) \;$ \textcolor{gray}{"feature vector"} + +\begin{figure} +\centering +\raisebox{-0.5\height}{\includegraphics[width=0.69\textwidth]{figures/supervised_nutshell.png}} +\raisebox{-0.5\height}{\includegraphics[width=0.30\textwidth]{figures/loss_fct.png}} +\end{figure} + +* Design function $y(\vec x, \vec w)$ with adjustable parameters $\vec w$ +* Design a loss function +* Find best parameters which minimize loss + + + +## Supervised learning: classification and regression + +The codomain $Y$ of the function y: $X \to Y$ can be a set of labels or classes or a continuous domain, e.g., $\mathbb{R}$ + +\vfill + +* $Y$ = finite set of labels $\quad \to \quad$ \textcolor{red}{classification} + * binary classification: $Y = \{0,1\}$ + * multi-class classification: $Y = \{c_1, c_2, ..., c_n\}$ +* $Y$ = real numbers $\quad \to \quad$ \textcolor{red}{regression} + +\vfill + +\textcolor{gray}{"All the impressive achievements of deep learning amount to just curve fitting" \\[0.5cm]} +\footnotesize +\textcolor{gray}{J. 
Pearl, Turing Award Winner 2011\\} +\tiny +[\color{gray}{To Build Truly Intelligent Machines, Teach Them Cause and Effect, Quantamagazine}](https://www.quantamagazine.org/to-build-truly-intelligent-machines-teach-them-cause-and-effect-20180515/) +\normalsize + +## Classification: Learning decision boundaries + +\begin{figure} +\centering +\includegraphics{figures/decision_boundaries.png} +\end{figure} + +## Supervised learning: Training, validation, and test sample +* Decision boundary fixed with \textcolor{blue}{training sample} +* Performance on training sample becomes better with more iterations +* Danger of overtraining: Statistical fluctuations of the training sample will be learnt +* \textcolor{blue}{Validation sample} = independent labeled data set not used for training $\rightarrow$ check for overtraining +* Sign of overtraining: performance on validation sample becomes worse $\rightarrow$ Stop training when signs of overtraining are observed (early stopping) +* Performance: apply classifier to independent \textcolor{blue}{test sample} +* Often: test sample = validation sample (only small bias) + +## Supervised learning: Cross validation + +Rule of thumb if training data not expensive + +::: columns +:::: {.column width=60%} +* Training sample: 50% +* Validation sample: 25% +* Test sample: 25% + +\vspace{2ex} + +Cross validation (efficient use of scarce training data) + +* Split training sample in $k$ independent subset $T_k$ of the full sample $T$ +* Train on $T \setminus T_k$ resulting in $k$ different classifiers +* For each training event there is one classifier that didn't use this event for training +* Validation results are then combined +:::: +:::: {.column width=40%} +\textcolor{gray}{Often test sample = validation sample (bias is rather small)} + +\vspace{10ex} +![](figures/cross_val.png) +:::: +::: + +## Often used loss functions +::: columns +:::: {.column width=45%} +\textcolor{blue}{Square error loss}: + +* often used in regression + +:::: +:::: {.column width=55%} +$$ E(y(\vec x, \vec w), t) = (y(\vec x, \vec w) - t)^2 $$ +:::: +::: + +\vfill + +::: columns +:::: {.column width=45%} +\textcolor{blue}{Cross entropy}: + +* $t \in \{0,1\}$ +* $y(\vec x, \vec w)$: predicted probability for outcome $t=1$ +* often used in classification + +:::: +:::: {.column width=55%} +\begin{align*} +E(y(\vec x, \vec w), t) = & - t \log y(\vec x, \vec w) \\ & - (1 - t) \log(1 - y(\vec x, \vec w)) +\end{align*} + +:::: +::: + +## More on entropy +* Self-information of an event $x$: $I(x) = - \log p(x)$ + * in units of **nats** (1 nat = information gained by observing an event of probability $1/e$) + +\vfill + +* Shannon entropy: $H(P) = - \sum p_i \log p_i$ + * Expected amount of information in an event drawn from a distribution $P$ + * Measure of the minimum of amount of bits needed on average to encode symbols drawn from a distribution + +\vfill + +* Cross entropy: $H(P,Q) = - E[\log Q] = - \sum p_i \log q_i$ + * Can be interpreted as a measure of the amount of bits needed when a wrong distribution Q is assumed while the data actually follows a distribution P + * Measure of dissimilarity between distributions P and Q (i.e, a measure of how well the model Q describes the true distribution P) + +## Hypothesis testing +::: columns +:::: {.column width=55%} +\includegraphics[width=\textwidth]{figures/signal_background_distr.png} +:::: +:::: {.column width=45%} +\vspace{2ex} +test statistic + +* a (usually scalar) variable which is a function of the data alone that can be used to test 
hypotheses +* example: $\chi^2$ w.r.t. a theory curve + +:::: +::: + +\textcolor{gray}{$\epsilon_\mathrm{B} \equiv \alpha$}: "background efficiency", i.e., prob. to misclassify bckg. as signal + +\textcolor{gray}{$\epsilon_\mathrm{S} \equiv 1 - \beta$}: "signal efficiency" + +\begin{center} +\begin{tabular}{ l l l} + & $H_0$ is true & $H_0$ is false (i.e., $H_1$ is true)\\ + \hline + $H_0$ is rejected & Type I error ($\alpha$) & Correct decision ($1 - \beta$) \\ + $H_0$ is not rejected & Correct decision ($1 - \alpha$) & Type II error ($\beta$) \\ + \hline +\end{tabular} +\end{center} + + +## Neyman-Pearson Lemma + +The likelihood ratio + +$$ t(\vec x) = \frac{f(\vec x|H_1)}{f(\vec x|H_0)} $$ + +is an optimal test statistic, i.e., it provides highest "signal efficiency" $1-\beta$ for a given "background efficiency" $\alpha$. Accept hypothesis if $t(\vec x) > c$. + +\vfill + +Problem: the underlying pdf's are almost never known explicitly. + +\vfill + +Two approaches + +1. Estimate signal and background pdf's and construct test statistic based on Neyman-Pearson lemma + +2. Decision boundaries determined directly without approximating the pdf's (linear discriminants, decision trees, neural networks, ...) + + +## Estimating PDFs from Histograms? + +\begin{center} +\includegraphics[width=0.8\textwidth]{figures/pdf_from_2d_histogram.png} +$\color{gray} \text{approximate PDF by} \; N(x,y|S) \; \text{and} \; N(x,y|B)$ +\end{center} + +$M$ bins per variable in $d$ dimensions: $M^d$ cells$\to$ hard to generate enough training data (often not practical for $d > 1$) + + +In general in machine learning, problems related to a large number of dimensions of the feature space are referred to as the \textcolor{red}{"curse of dimensionality"} + +## Na$\text{\"i}$ve Bayesian Classifier (also called "Projected Likelihood Classification") + +Application of the Neyman-Pearson lemma (ignoring correlations between the $x_i$): + +$$ f(x_1, x_2, ..., x_n) \quad \mbox{approximated as} \quad L = f_1(x_1) \cdot f_2(x_2) \cdot ... \cdot f_n(x_n) $$ +\begin{align*} +\mbox{where} \quad +f_1(x_1) & = \int \mathrm dx_2 \mathrm dx_3 ... \mathrm dx_n\; f(x_1, x_2, ..., x_n) \\ +f_2(x_2) & = \int \mathrm dx_1 \mathrm dx_3 ... 
\mathrm dx_n\; f(x_1, x_2, ..., x_n) \\ +\vdots +\end{align*} +Classification of feature vector $x$: +$$ +y(\vec x) = \frac{L_\mathrm{s}(\vec x)}{L_\mathrm{s}(\vec x) + L_\mathrm{b}(\vec x)} = \frac{1}{1 + L_\mathrm{b}(\vec x) / L_\mathrm{s}(\vec x)} +$$ + + +Performance not optimal if true PDF does not factorize + +## k-Nearest Neighbor Method (1) + +$k$-NN classifier: + +* Estimates probability density around the input vector +* $p(\vec x|S)$ and $p(\vec x|B)$ are approximated by the number of signal and background events in the training sample that lie in a small volume around the point $\vec x$ + +\vspace{2ex} + +Algorithms finds $k$ nearest neighbors: +$$ k = k_s + k_b $$ + +Probability for the event to be of signal type: + +$$ p_s(\vec x) = \frac{k_s(\vec x)}{k_s(\vec x) + k_b(\vec x)} $$ + +## k-Nearest Neighbor Method (2) + +::: columns +:::: {.column width=60%} +Simplest choice for distance measure in feature space is the Euclidean distance: +$$ R = |\vec x - \vec y|$$ + +Better: take correlations between variables into account: + +$$ R = \sqrt{(\vec{x}-\vec{y})^T \mat{V}^{-1} (\vec{x}-\vec{y})} $$ +$$ \mat{V} = \text{covariance matrix}, R = \text{"Mahalanobis distance"}$$ + + +:::: +:::: {.column width=40%} +![](figures/knn.png) +:::: +::: + +\vfill + +The $k$-NN classifier has best performance when the boundary that separates signal and background events has irregular features that cannot be easily approximated by parametric learning methods. + + +## Fisher Linear Discriminant + +Linear discriminant is simple. Can still be optimal if amount of training data is limited. + + +Ansatz for test statistic: $$ y(\vec x) = \sum_{i=1}^n w_i x_i = \vec w^\intercal \vec x $$ + + +Choose parameters $w_i$ so that separation between signal and background distribution is maximum. + +\vfill + +Need to define "separation". + + +::: columns +:::: {.column width=45%} +\begin{center} +Fisher: maximize $$ J(\vec w) = \frac{(\tau_s - \tau_b)^2}{\Sigma_s^2 + \Sigma_b^2} $$ +\end{center} +:::: +:::: {.column width=55%} +![](figures/fisher.png) +:::: +::: + +## Fisher Linear Discriminant: Determining the Coefficients $w_i$ + +::: columns +:::: {.column width=60%} +Coefficients are obtained from: $$ \frac{\partial J}{\partial w_i} = 0 $$ + +\vspace{2ex} + +Linear decision boundaries + +\vspace{5ex} + +Weight vector $\vec w$ can be interpreted as a direction in feature space onto which the events are projected. +:::: +:::: {.column width=40%} +![](figures/fisher_linear_decision_boundary.png) +:::: +::: + + + + +## Linear regression revisited + +\vfill + +::: columns +:::: {.column width=50%} +\small \textcolor{gray}{"Galton family heights data": \\ origin of the term "regression"} \normalsize +![](figures/03_ml_basics_galton_linear_regression_iminuit.pdf) + +:::: +:::: {.column width=50%} + +* data: $\{x_i,y_i\}$ \ +* objective: predict $y = f(x)$ +* model: $f(x; \vec \theta) = m x + b, \quad \vec \theta = (m, b)$ +* loss function: $J(\theta|x,y) = \frac{1}{N} \sum_{i=1}^N (y_i - f(x_i))^2$ +* model training: optimal parameters $\hat{\vec{\theta}} = \mathrm{arg\,min} \, J(\vec \theta)$ + +:::: +::: + +## Linear regression + +* Data: vectors with $p$ components ("features"): $\vec x = (x_1, ..., x_p)$ +* $n$ observations: $\{\vec x_i, y_i\}, \quad i = 1, ..., n$ +* Prediction for given vector $x$: + $$ y = w_0 + w_1 x_1 + w_2 x_2 + ... 
+ w_p x_p \equiv \vec w^\intercal \vec x \quad \text{where } x_0 := 1 $$ + +* Find weights that minimze loss function: + $$\hat{\vec{w}} = \underset{\vec w}{\min} \sum_{i=1}^{n} (\vec w^\intercal \vec x_i - y_i)^2$$ + +* In case of linear regression closed-form solution exists: + $$ \hat{\vec{w}} = (\mat{X}^\intercal \mat{X})^{-1} \mat{X}^\intercal \vec y \quad \text{where} \; X \in \mathbb{R}^{n \times p}$$ + +* $X$ is called the design matrix, row $i$ of $X$ is $\vec x_i$ + +## Linear regression with regularization + +::: columns +:::: {.column width=45%} +* Standard loss function + $$ C(\vec w) = \sum_{i=1}^{n} (\vec w^\intercal \vec x_i - y_i)^2 $$ + +* Ridge regression + $$ C(\vec w) = \sum_{i=1}^{n} (\vec w^\intercal \vec x_i - y_i)^2 + \lambda |\vec w|^2$$ + +* LASSO regression + $$ C(\vec w) = \sum_{i=1}^{n} (\vec w^\intercal \vec x_i - y_i)^2 + \lambda |\vec w| $$ + +:::: +:::: {.column width=55%} + +\vfill + +![](figures/L1vsL2.pdf) +\small \textcolor{gray}{LASSO regression tends to give sparse solutions (many components $w_j = 0$). This is why LASSO regression is also called sparse regression.} \normalsize +:::: +::: + +## Logistic regression (1) + +* Consider binary classification task, e.g., $y_i \in \{0,1\}$ +* Objective: Predict probability for outcome $y=1$ given an observation $\vec x$ +* Starting with linear "score" + $$ s = w_0 + w_1 x_1 + w_2 x_2 + ... + w_p x_p \equiv \vec w^\intercal \vec x$$ +* Define function that translates $s$ into a quantity that has the properties of a probability + $$ \sigma(s) = \frac{1}{1+e^{-s}} $$ +* We would like to determine the optimal weights for a given training data set. They result from the maximum-likelihood principle. + +## Logistic regression (2) + +* Consider feature vector $\vec x$. For a given set of weights $\vec w$ the model predicts + * a probability $p(1|\vec w) = \sigma(\vec w^\intercal \vec x)$ for outcome $y=1$ + * a probabiltiy $p(0|\vec w) = 1 - \sigma(\vec w^\intercal \vec x)$ for outcome $y=0$ +* The probability $p(y_i | \vec w)$ defines the likelihood $L_i(\vec w) = p(y_i | \vec w)$ (the likelihood is a function of the parameters $\vec w$ and the observations $y_i$ are fixed). 
+* Likelihood for the full data sample ($n$ observations) + $$ L(\vec w) = \prod_{i=1}^n L_i(\vec w) = \prod_{i=1}^n \sigma(\vec w^\intercal \vec x)^{y_i} \,(1-\sigma(\vec w^\intercal \vec x))^{1-y_i} $$ +* Maximizing the log-likelihood $\ln L(\vec w)$ corresponds to minimizing the loss function + $$ C(\vec w) = - \ln L(\vec w) = \sum_{i=1}^n - y_i \ln \sigma(\vec w^\intercal \vec x) - +(1-y_i) \ln(1-\sigma(\vec w^\intercal \vec x))$$ +* This is nothing else but the cross-entropy loss function + +## scikit-learn + +::: columns +:::: {.column width=70%} +* Free software machine learning library for Python +* Initial release: 2007 +* features various classification, regression and clustering algorithms including k-nearest neighbors, multi-layer perceptrons, support vector machines, random forests, gradient boosting, k-means +* Scikit-learn is one of the most popular machine learning libraries on GitHub +* [https://scikit-learn.org/](https://scikit-learn.org/) +:::: +:::: {.column width=30%} +\vspace{7ex} +\begin{figure} +\centering +\includegraphics[width=0.85\textwidth]{figures/scikit-learn.png} +\end{figure} +:::: +::: + + +## Example 1 - Probability of passing an exam (logistic regression) (1) + +Objective: predict the probability that someone passes an exam based on the number of hours studying + +$$ p_\mathrm{pass} = \sigma(s) = \frac{1}{1+e^{-s}}, \quad s = w_1 t + w_0, \quad t = \text{\# hours}$$ + +::: columns +:::: {.column width=40%} +* Data set: \ + * preparation $t$ time in hours + * passed / not passes (0/1) +* Parameters need to be determined through numerical minimization + * $w_0 = -4.0777$ + * $w_1 = 1.5046$ + + +\vspace{1.5ex} +\footnotesize +[\textcolor{gray}{03\_ml\_basics\_logistic\_regression.ipynb}](https://nbviewer.jupyter.org/urls/www.physi.uni-heidelberg.de/~reygers/lectures/2022/ml/examples/03_ml_basics_logistic_regression.ipynb) +\normalsize +:::: +:::: {.column width=60%} +![](figures/03_ml_basics_logistic_regression.pdf){width=90%} +:::: +::: + +## Example 1 - Probability of passing an exam (logistic regression) (2) + +\footnotesize +\textcolor{gray}{Read data from file:} +```python +# data: 1. hours studies, 2. passed (0/1) +df = pd.read_csv(filename, engine='python', sep='\s+') +x_tmp = df['hours_studied'].values +x = np.reshape(x_tmp, (-1, 1)) +y = df['passed'].values +``` +\vfill +\textcolor{gray}{Fit the data:} +```python +from sklearn.linear_model import LogisticRegression +clf = LogisticRegression(penalty='none', fit_intercept=True) +clf.fit(x, y); +``` +\vfill +\textcolor{gray}{Calculate predictions:} +```python +hours_studied_tmp = np.linspace(0., 6., 1000) +hours_studied = np.reshape(hours_studied_tmp, (-1, 1)) +y_pred = clf.predict_proba(hours_studied) +``` +\normalsize + +## Precision and recall + +::: columns +:::: {.column width=50%} +\textcolor{blue}{Precision:}\ +Fraction of correctly classified instances among all instances that obtain a certain class label. + +$$ \text{precision} = \frac{\text{TP}}{\text{TP} + \text{FP}} $$ + +\begin{center} +\textcolor{gray}{"purity"} +\end{center} + +:::: +:::: {.column width=50%} +\textcolor{blue}{Recall:}\ +Fraction of positive instances that are correctly classified. 
+\vspace{2.9ex} + +$$ \text{recall} = \frac{\text{TP}}{\text{TP} + \text{FN}} $$ + +\begin{center} +\textcolor{gray}{"efficiency"} +\end{center} + +:::: +::: +\vfill +\begin{center} +\textcolor{gray}{TP: true positives, FP: false positives, FN: false negatives} +\end{center} + +## Example 2: Heart disease data set (logistic regression) (1) + +\scriptsize +\textcolor{gray}{Read data:} +```python +filename = "https://www.physi.uni-heidelberg.de/~reygers/lectures/2022/ml/data/heart.csv" +df = pd.read_csv(filename) +df +``` +\vfill +![](figures/heart_table.png){width=70%} +\normalsize +\vspace{1.5ex} +\footnotesize +[\textcolor{gray}{03\_ml\_basics\_log\_regr\_heart\_disease.ipynb}](https://nbviewer.jupyter.org/urls/www.physi.uni-heidelberg.de/~reygers/lectures/2022/ml/examples/03_ml_basics_log_regr_heart_disease.ipynb) +\normalsize + +## Example 2: Heart disease data set (logistic regression) (2) +\footnotesize + +\textcolor{gray}{Define array of labels and feature vectors} +```python +y = df['target'].values +X = df[[col for col in df.columns if col!="target"]] +``` +\vfill +\textcolor{gray}{Generate training and test data sets} +```python +from sklearn.model_selection import train_test_split +X_train, X_test, y_train, y_test + = train_test_split(X, y, test_size=0.5, shuffle=True) +``` +\vfill +\textcolor{gray}{Fit the model} +```python +from sklearn.linear_model import LogisticRegression +lr = LogisticRegression(penalty='none', + fit_intercept=True, max_iter=1000, tol=1E-5) +lr.fit(X_train, y_train) +``` +\normalsize + +## Example 2: Heart disease data set (logistic regression) (3) +\footnotesize +\textcolor{gray}{Test predictions on test data set:} +```python +from sklearn.metrics import classification_report +y_pred_lr = lr.predict(X_test) +print(classification_report(y_test, y_pred_lr)) +``` +\vfill +\textcolor{gray}{Output:} +``` + precision recall f1-score support + + 0 0.75 0.86 0.80 63 + 1 0.89 0.80 0.84 89 + + accuracy 0.82 152 + macro avg 0.82 0.83 0.82 152 +weighted avg 0.83 0.82 0.82 152 +``` + +## Example 2: Heart disease data set (logistic regression) (4) + +\textcolor{gray}{Compare to another classifier using the \textit{receiver operating characteristic} (ROC) curve} +\vfill +\textcolor{gray}{Let's take the random forest classifier} +\footnotesize +```python +from sklearn.ensemble import RandomForestClassifier +rf = RandomForestClassifier(max_depth=3) +rf.fit(X_train, y_train) +``` +\normalsize +\vfill +\textcolor{gray}{Use \texttt{roc\_curve} from scikit-learn} +\footnotesize +```python +from sklearn.metrics import roc_curve + +y_pred_prob_lr = lr.predict_proba(X_test) # predicted probabilities +fpr_lr, tpr_lr, _ = roc_curve(y_test, y_pred_prob_lr[:,1]) + +y_pred_prob_rf = rf.predict_proba(X_test) # predicted probabilities +fpr_rf, tpr_rf, _ = roc_curve(y_test, y_pred_prob_rf[:,1]) + +``` +\normalsize + +## Example 2: Heart disease data set (logistic regression) (5) +::: columns +:::: {.column width=50%} +\scriptsize +```python +plt.plot(tpr_lr, 1-fpr_lr, label="log. 
regression") +plt.plot(tpr_rf, 1-fpr_rf, label="random forest") +``` +\vspace{5ex} + +\normalsize +\textcolor{gray}{Classifiers can be compared with the \textit{area under curve} (AUC) score.} +\scriptsize +```python +from sklearn.metrics import roc_auc_score +auc_lr = roc_auc_score(y_test,y_pred_lr) +auc_rf = roc_auc_score(y_test,y_pred_rf) +print(f"AUC scores: {auc_lr:.2f}, {auc_knn:.2f}") +``` +\vspace{5ex} +\normalsize +\textcolor{gray}{This gives} +\scriptsize +``` +AUC scores: 0.82, 0.83 +``` +\normalsize + +:::: +:::: {.column width=50%} +\begin{figure} +\centering +\includegraphics[width=0.96\textwidth]{figures/03_ml_basics_log_regr_heart_disease.pdf} +\end{figure} +:::: +::: + +## Multinomial logistic regression: Softmax function + +In the previous example we considered two classes (0, 1). For multi-class classification, the logistic function can generalized to the softmax function. +\vfill +Now consider $k$ classes and let $s_i$ be the score for class $i$: $\vec s = (s_1, ..., s_k)$ +\vfill +A probability for class $i$ can be predicted with the softmax function: + $$ \sigma(\vec s)_i = \frac{e^{s_i}}{\sum_{j=1}^k e^{s_j}} \quad \text{ for } \quad i = 1, ... , k $$ +The softmax functions is often used as the last activation function of a neural network in order to predict probabilities in a classification task. +\vfill +Multinomial logistic regression is also known as softmax regression. + +## Example 3: Iris data set (softmax regression) (1) + +Iris flower data set + +* Introduced 1936 in a paper by Ronald Fisher +* Task: classify flowers +* Three species: iris setosa, iris virginica and iris versicolor +* Four features: petal width and length, sepal width/length, in centimeters + +::: columns +:::: {.column width=40%} +\begin{figure} +\centering +\includegraphics[width=0.95\textwidth]{figures/iris_dataset.png} +\end{figure} +:::: +:::: {.column width=60%} + +\vspace{2ex} + +\footnotesize +[\textcolor{gray}{03\_ml\_basics\_iris\_softmax\_regression.ipynb}](https://nbviewer.jupyter.org/urls/www.physi.uni-heidelberg.de/~reygers/lectures/2022/ml/examples/03_ml_basics_iris_softmax_regression.ipynb) + +\vspace{19ex} + +\scriptsize +[https://archive.ics.uci.edu/ml/datasets/Iris](https://archive.ics.uci.edu/ml/datasets/Iris) + +[https://en.wikipedia.org/wiki/Iris_flower_data_set](https://en.wikipedia.org/wiki/Iris_flower_data_set) +\normalsize +:::: +::: + +## Example 3: Iris data set (softmax regression) (2) + +\textcolor{gray}{Get data set} +\footnotesize +```python +# import some data to play with +# columns: Sepal Length, Sepal Width, Petal Length and Petal Width +iris = datasets.load_iris() +X = iris.data +y = iris.target + +# split data into training and test data sets +x_train, x_test, y_train, y_test = + train_test_split(X, y, test_size=0.5, random_state=42) +``` +\normalsize +\vfill + +\textcolor{gray}{Softmax regression} +\footnotesize +```python +from sklearn.linear_model import LogisticRegression +log_reg = LogisticRegression(multi_class='multinomial', penalty='none') +log_reg.fit(x_train, y_train); +``` +\normalsize + +## Example 3 : Iris data set (softmax regression) (3) + +::: columns +:::: {.column width=70%} +\textcolor{gray}{Accuracy and confusion matrix for different classifiers} +\footnotesize +```python +for clf in [log_reg, kn_neigh, fisher_ld]: + y_pred = clf.predict(x_test) + acc = accuracy_score(y_test, y_pred) + print(type(clf).__name__) + print(f"accuracy: {acc:0.2f}") + + # confusion matrix: + # columns: true class, row: predicted class + 
print(confusion_matrix(y_test, y_pred),"\n") +``` +\normalsize +:::: +:::: {.column width=30%} + +\footnotesize +``` +LogisticRegression +accuracy: 0.96 +[[29 0 0] + [ 0 23 0] + [ 0 3 20]] + +KNeighborsClassifier +accuracy: 0.95 +[[29 0 0] + [ 0 23 0] + [ 0 4 19]] + +LinearDiscriminantAnalysis +accuracy: 0.99 +[[29 0 0] + [ 0 23 0] + [ 0 1 22]] +``` +\normalsize +:::: +::: + +## General remarks on multi-variate analyses (MVAs) + +* MVA Methods + * More effective than classic cut-based analyses + * Take correlations of input variables into account +\vfill +* Important: find good input variables for MVA methods + * Good separation power between S and B + * No strong correlation among variables + * No correlation with the parameters you try to measure in your signal sample! +\vfill +* Pre-processing + * Apply obvious variable transformations and let MVA method do the rest + * Make use of obvious symmetries: if e.g. a particle production process is symmetric in polar angle $\theta$ use $|\cos \theta|$ and not $\cos \theta$ as input variable + * It is generally useful to bring all input variables to a similar numerical range + +## Example of feature transformation + +\begin{figure} +\centering +\includegraphics[width=0.95\textwidth]{figures/feature_transformation.png} +\end{figure} + +## Exercise 1: Classification of air showers measured with the MAGIC telescope + +::: columns +:::: {.column width=50%} + +\small +* Cosmic gamma rays (30 GeV - 30 TeV). +* Cherenkov light from air showers +* Background: air showers caused by hadrons. +\normalsize + +\begin{figure} +\centering +\includegraphics[width=0.85\textwidth]{figures/magic_photo_small.png} +\end{figure} +:::: +:::: {.column width=50%} +![](figures/magic_sketch.png) +:::: +::: + +## Exercise 1: Classification of air showers measured with the MAGIC telescope +\begin{figure} +\centering +\includegraphics[width=0.75\textwidth]{figures/magic_shower_em_had_small.png} +\end{figure} +::: columns +:::: {.column width=50%} +\begin{center} +Gamma shower +\end{center} +:::: +:::: {.column width=50%} +\begin{center} +Hadronic shower +\end{center} +:::: +::: + +## Exercise 1: Classification of air showers measured with the MAGIC telescope +\begin{figure} +\centering +\includegraphics[width=0.95\textwidth]{figures/magic_shower_parameters.png} +\end{figure} + +## Exercise 1: Classification of air showers measured with the MAGIC telescope +MAGIC data set \ +\tiny +[\textcolor{gray}{https://archive.ics.uci.edu/ml/datasets/magic+gamma+telescope}](https://archive.ics.uci.edu/ml/datasets/magic+gamma+telescope) +\normalsize + +\scriptsize +``` +1. fLength: continuous # major axis of ellipse [mm] +2. fWidth: continuous # minor axis of ellipse [mm] +3. fSize: continuous # 10-log of sum of content of all pixels [in #phot] +4. fConc: continuous # ratio of sum of two highest pixels over fSize [ratio] +5. fConc1: continuous # ratio of highest pixel over fSize [ratio] +6. fAsym: continuous # dist. from highest pixel to center, proj. onto major axis [mm] +7. fM3Long: continuous # 3rd root of third moment along major axis [mm] +8. fM3Trans: continuous # 3rd root of third moment along minor axis [mm] +9. fAlpha: continuous # angle of major axis with vector to origin [deg] +10. fDist: continuous # distance from origin to center of ellipse [mm] +11. class: g,h # gamma (signal), hadron (background) + +g = gamma (signal): 12332 +h = hadron (background): 6688 + +For technical reasons, the number of h events is underestimated. 
+In the real data, the h class represents the majority of the events. +``` +\normalsize + +## Exercise 1: Classification of air showers measured with the MAGIC telescope + +\small +[\textcolor{gray}{03\_ml\_basics\_ex\_1\_magic.ipynb}](https://nbviewer.jupyter.org/urls/www.physi.uni-heidelberg.de/~reygers/lectures/2022/ml/exercises/03_ml_basics_ex_1_magic.ipynb) +\normalsize + +a) Create for each variable a figure with a plot for gammas and hadrons overlayed. +b) Create training and test data set. The test data should amount to 50% of the total data set. +c) Define the logistic regressor and fit the training data +d) Determine the model accuracy and the AUC score +e) Plot the ROC curve (background rejection vs signal efficiency) + +## Exercise 2: Hand-written digit recognition with logistic regression + +\small +[\textcolor{gray}{03\_ml\_basics\_ex\_2\_mnist\_softmax\_regression.ipynb}](https://nbviewer.jupyter.org/urls/www.physi.uni-heidelberg.de/~reygers/lectures/2022/ml/exercises/03_ml_basics_ex_2_mnist_softmax_regression.ipynb) +\normalsize + +a) Define logistic regressor from scikit-learn and fit data +b) Use \texttt{classification\_report} from scikit-learn to determine precision and recall +c) Read in a hand-written digit and classify it. Print the probabilities for each digit. Determine the digit with the highest probability. +d) (Optional) Create you own hand-written digit with a program like gimp and check what the classifier does + +\begin{figure} +\centering +\includegraphics[width=0.85\textwidth]{figures/handwritten_digits.png} +\end{figure} + +Hint: You can install required packages on the jupyter hub server like so: +\scriptsize +``` +!pip3 install --user pypng +``` +\normalsize + + +## Exercise 3: Data preprocessing + +a) Read the description of the [`sklearn.preprocessing`](https://scikit-learn.org/stable/modules/preprocessing.html) package. + +b) Start from the example notebook on the logistic regression for the heart disease data set ([03_ml_basics_log_regr_heart_disease.ipynb](https://nbviewer.jupyter.org/urls/www.physi.uni-heidelberg.de/~reygers/lectures/2022/ml/examples/03_ml_basics_log_regr_heart_disease.ipynb)). Pre-process the heart disease data set according to the given example. Does preprocessing make a difference in this case? + diff --git a/slides/neural_networks.md b/slides/neural_networks.md new file mode 100644 index 0000000..1ffcfc5 --- /dev/null +++ b/slides/neural_networks.md @@ -0,0 +1,808 @@ +--- +title: | + | Introduction to Data Analysis and Machine Learning in Physics: + | 5. 
Neural networks + +author: "Martino Borsato, Jörg Marks, Klaus Reygers" +date: "Studierendentage, 11-14 April 2022" +--- + +## Exercises + +* Exercise 1: Learn XOR with a MLP + * [`05_neural_networks_ex_1_xor.ipynb`](https://nbviewer.jupyter.org/urls/www.physi.uni-heidelberg.de/~reygers/lectures/2022/ml/exercises/05_neural_networks_ex_1_xor.ipynb) +* Exercise 2: Visualising decision boundaries of classifiers + * [`05_neural_networks_ex_2_decision_boundaries.ipynb`](https://nbviewer.jupyter.org/urls/www.physi.uni-heidelberg.de/~reygers/lectures/2022/ml/exercises/05_neural_networks_ex_2_decision_boundaries.ipynb) +* Exercise 3: Boston house prices (MLP regression) + * [`05_neural_networks_ex_3_boston_house_prices.ipynb`](https://nbviewer.jupyter.org/urls/www.physi.uni-heidelberg.de/~reygers/lectures/2022/ml/exercises/05_neural_networks_ex_3_boston_house_prices.ipynb) +* Exercise 4: Training a digit-classification neural network on the MNIST dataset using Keras + * [`05_neural_networks_ex_4_mnist_keras_train.ipynb`](https://nbviewer.jupyter.org/urls/www.physi.uni-heidelberg.de/~reygers/lectures/2022/ml/exercises/05_neural_networks_ex_4_mnist_keras_train.ipynb) + + +## Perceptron (1) + +::: columns +:::: {.column width=65%} +\begin{center} +\includegraphics[width=0.40\textwidth]{figures/perceptron_weighted_sum.png} +\vspace{1ex} +\includegraphics[width=0.75\textwidth]{figures/perceptron_retina.png} +\end{center} +:::: +:::: {.column width=35%} +$$h(\vec x) = \begin{cases}1 & \text{if }\ \vec w \cdot \vec x + b > 0,\\0 & \text{otherwise}\end{cases}$$ +\begin{center} +\includegraphics[width=0.95\textwidth]{figures/perceptron_photo.png} +\tiny +\textcolor{gray}{Mark 1 Perceptron. Frank Rosenblatt (1961)} +\normalsize +\end{center} +:::: +::: +\footnotesize +\vspace{2ex} +\textcolor{gray}{The perceptron was designed for image recognition. It was first implemented in hardware (400 photocells, weights = potentiometer settings).} +\normalsize + +## Perceptron (2) +::: columns +:::: {.column width=60%} +* McCulloch–Pitts (MCP) neuron (1943) + * First mathematical model of a biological neuron + * Boolean input + * Equal weights for all inputs + * Threshold hardcoded +* Improvements by Rosenblatt + * Different weights for inputs + * Algorithm to update weights and threshold given labeled training data + +\vfill + +Shortcoming of the perceptron: \newline +it cannot learn the XOR function \newline +\tiny \textcolor{gray}{Minsky, Papert, 1969} \normalsize + +:::: +:::: {.column width=40%} +![](figures/perceptron_with_threshold.png){width=80%} +![](figures/xor.png) +\small \textcolor{gray}{XOR: not linearly separable } \normalsize + +:::: +::: + +## The biological inspiration: the neuron + +\begin{figure} +\centering +\includegraphics[width=0.95\textwidth]{figures/neuron.png} +\end{figure} + +## Non-linear transfer / activation function + +Discriminant: $$ y(\vec x) = h\left( w_0 + \sum_{i=1}^n w_i x_i \right) $$ + +Examples for function $h$: \newline +$$ \frac{1}{1+e^{-x}} \; \text{("sigmoid" or "logistic" function)}, \quad \tanh x $$ + +::: columns +:::: {.column width=50%} +\begin{figure} +\centering +\includegraphics[width=0.75\textwidth]{figures/logistic_fct.png} +\end{figure} +:::: +:::: {.column width=50%} +\vspace{3ex} +Non-linear activation function needed in neural networks when feature space is not linearly separable. 
+\newline + +\small +\textcolor{gray}{Neural net with linear activation functions is just a perceptron} +\normalsize +:::: +::: + +## Feedforward neural network with one hidden layer +::: columns +:::: {.column width=60%} +![](figures/mlp.png){width=80%} +:::: +:::: {.column width=40%} +$$ \phi_i(\vec x) = h\left(w_{i0}^{(1)} + \sum_{j=1}^n w_{ij}^{(1)} x_j\right) $$ +\vfill +$$ y(\vec x) = h\left( w_{10}^{(2)} + \sum_{j=1}^m w_{1j}^{(2)} \phi_j(\vec x)\right) $$ +\vfill +\vspace{2ex} +\footnotesize +\textcolor{gray}{superscripts indicates layer number, i.e., $w_{ij}^{(1)}$ refers to the input weights of neuron $i$ in the hidden layer (= layer 1).} +\normalsize + +:::: +::: +\begin{center} +Straightforward to generalize to multiple hidden layers +\end{center} + +## Neural network output and decision boundaries +::: columns +:::: {.column width=75%} +\begin{figure} +\centering +\includegraphics[width=\textwidth]{figures/nn_decision_boundary.png} +\end{figure} +:::: +:::: {.column width=25%} +\vspace{3ex} +\footnotesize +\textcolor{gray}{P. Bhat, Multivariate Analysis Methods in Particle Physics, inspirehep.net/record/879273} +\normalsize +:::: +::: + +## Fun with neural nets in the browser +\begin{figure} +\centering +\includegraphics[width=\textwidth]{figures/tf_playground.png} +\end{figure} +\tiny +[\textcolor{gray}{http://playground.tensorflow.org}](http://playground.tensorflow.org) +\normalsize + +## Backpropagation (1) +Start with an initial guess $\vec w_0$ for the weights an then update weights after each training event: +$$ \vec w^{(\tau+1)} = \vec w^{(\tau)} - \eta \nabla E_a(\vec w^{(\tau)}), \quad \eta = \text{learning rate}$$ + +Gradient descent: +\begin{figure} +\centering +\includegraphics[width=0.46\textwidth]{figures/gradient_descent.png} +\end{figure} + +## Backpropagation (2) +::: columns +:::: {.column width=40%} +\vspace{6ex} +![](figures/mlp.png){width=100%} +:::: +:::: {.column width=60%} +Let's write network output as follows: +\begin{align*} +y(\vec x) &= h(u(\vec x)); \quad u(\vec x) = \sum_{j=0}^m w_{1j}^{(2)} \phi_j(\vec x) \\ +\phi_j(\vec x) &= h\left( \sum_{k=0}^n w_{jk}^{(1)} x_k\right) +\equiv h\left( v_j(\vec x) \right) +\end{align*} + +For $E_a = \frac{1}{2} (y_a - t_a)^2$ one obtains for the weights from hidden layer to output: +\begin{align*} +\frac{\partial E_a}{\partial w_{1j}^{(2)}} &= (y_a -t_a) h'(u(\vec x_a)) \frac{\partial u}{\partial w_{1j}^{(2)}} \\ +&= (y_a -t_a) h'(u(\vec x_a)) \phi_j(\vec x_a) +\end{align*} +:::: +::: +\vspace{2ex} +Further application of the chain rule gives weights from input to hidden layer. 
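+
+## Backpropagation: a minimal numpy sketch
+
+As an illustration (a sketch added here, not an excerpt from the lecture notebooks), the update rules above translate directly into a few lines of numpy -- assuming sigmoid activations for both layers, the squared-error loss $E_a = \frac{1}{2}(y_a - t_a)^2$, a single training event, and bias terms omitted:
+
+\footnotesize
+```python
+import numpy as np
+
+def sigmoid(z):
+    return 1.0 / (1.0 + np.exp(-z))
+
+x, t, eta = np.array([0.5, -1.2, 0.3]), 1.0, 0.1   # one event, target, learning rate
+rng = np.random.default_rng(1)
+W1 = rng.normal(size=(4, x.size))   # hidden-layer weights w^(1) (4 hidden neurons)
+W2 = rng.normal(size=4)             # output weights w^(2)
+
+# forward pass
+phi = sigmoid(W1 @ x)               # hidden activations phi_j
+u = W2 @ phi
+y = sigmoid(u)                      # network output
+
+# backward pass (chain rule)
+delta_out = (y - t) * y * (1 - y)             # dE/du, using h'(u) = y(1-y)
+grad_W2 = delta_out * phi                     # dE/dw^(2): (y-t) h'(u) phi_j
+delta_hid = delta_out * W2 * phi * (1 - phi)  # dE/dv_j
+grad_W1 = np.outer(delta_hid, x)              # dE/dw^(1)
+
+# gradient-descent update
+W2 -= eta * grad_W2
+W1 -= eta * grad_W1
+```
+\normalsize
+
+In practice one lets a framework such as Keras or TensorFlow do this bookkeeping (see below); spelling it out once shows that backpropagation is nothing more than the chain rule combined with gradient descent.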
+
+## Backpropagation (3)
+Backpropagation summary
+
+* Make a prediction for a given training instance (forward pass)
+* Calculate the error (value of the loss function)
+* Go backwards and determine the contribution of each weight (reverse pass)
+* Adjust the weights to reduce the error
+
+\vfill
+
+Practical considerations:
+
+* Nowadays, people implement neural networks with frameworks like Keras or TensorFlow
+* No need to implement backpropagation yourself
+* TensorFlow efficiently calculates the gradients for you (automatic differentiation)
+
+
+## More on gradient descent
+
+::: columns
+:::: {.column width=60%}
+* Stochastic gradient descent
+    * just uses one training event at a time
+    * fast, but quite irregular approach to the minimum
+    * can help escape local minima
+    * one can decrease the learning rate over time to settle at the minimum ("simulated annealing")
+* Batch gradient descent
+    * uses the entire training sample to calculate the gradient of the loss function
+    * computationally expensive
+* Mini-batch gradient descent
+    * calculates the gradient for a random sub-sample of the training set
+
+::::
+:::: {.column width=40%}
+\begin{figure}
+\centering
+\includegraphics[width=0.7\textwidth]{figures/stochastic_gradient_descent.png}
+\end{figure}
+\begin{figure}
+\centering
+\includegraphics[width=\textwidth]{figures/gradient_descent_cmp.png}
+\end{figure}
+::::
+:::
+
+## Universal approximation theorem
+
+::: columns
+:::: {.column width=60%}
+"A feed-forward network with a single hidden layer containing a finite number of neurons (i.e., a multilayer perceptron) can approximate continuous functions on compact subsets of $\mathbb{R}^n$."
+
+\vspace{5ex}
+
+One of the first versions of the theorem was proved by George Cybenko in 1989 for sigmoid activation functions.
+
+\vspace{5ex}
+
+The theorem does not touch upon the algorithmic learnability of those parameters.
+
+::::
+:::: {.column width=40%}
+\begin{figure}
+\centering
+\includegraphics[width=\textwidth]{figures/ann.png}
+\end{figure}
+::::
+:::
+
+## Deep neural networks
+Deep networks: many hidden layers with a large number of neurons
+
+::: columns
+:::: {.column width=45%}
+* Challenges
+    * Hard to train ("vanishing gradient problem")
+    * Training is slow
+    * Risk of overtraining
+::::
+:::: {.column width=55%}
+* Big progress in recent years
+    * Interest in NN waned before ca. 2006
+    * Milestone: paper by G. 
Hinton (2006): "learning for deep belief nets" + * Image recognition, AlphaGo, … + * Soon: self-driving cars, … +:::: +::: +\begin{figure} +\centering +\includegraphics[width=0.5\textwidth]{figures/dnn.png} +\end{figure} + +## Drawbacks of the sigmoid activation function + +::: columns +:::: {.column width=50%} +\includegraphics[width=.75\textwidth]{figures/sigmoid.png} +:::: +:::: {.column width=50%} +$$ \sigma(x) = \frac{1}{1 + e^{-x}} $$ +\vspace{3ex} + +* Saturated neurons “kill” the gradients +* Sigmoid outputs are not zero-centered +* exp() is a bit compute expensive +:::: +::: + +## Activation functions +\begin{figure} +\centering +\includegraphics[width=\textwidth]{figures/activation_functions.png} +\end{figure} + +## ReLU +::: columns +:::: {.column width=50%} +\includegraphics[width=.75\textwidth]{figures/relu.png} +:::: +:::: {.column width=50%} +$$ f(x) = \max(0,x) $$ +\vspace{1ex} + +* Does not saturate (in +region) +* Very computationally efficient +* Converges much faster than sigmoid tanh in practice +* Actually more biologically plausible than sigmoid +* But: gradient vanishes for $x < 0$ + +:::: +::: + + +## Bias-variance tradeoff (1) + +Goal: generalization of training data + +* Simple models (few parameters): danger of bias + * \textcolor{gray}{Classifiers with a small number of degrees of freedom are less prone to statistical fluctuations: different training samples would result in similar classification boundaries ("small variance")} +* Complex models (many parameters): danger of overfitting + * \textcolor{gray}{large variance of decision boundaries for different training samples} + +## Bias-variance tradeoff (2) +\begin{figure} +\centering +\includegraphics[trim=4cm 0cm 4cm 0cm, width=\textwidth]{figures/underfitting_overfitting.pdf} +\end{figure} + +## Example of overtraining +Too many neurons/layers make a neural network too flexible \newline $\to$ overtraining + +\begin{figure} +\centering +\includegraphics[width=0.9\textwidth]{figures/example_overtraining.png} +\end{figure} + +## Monitoring overtraining +Monitor fraction of misclassified events (or loss function:) +\begin{figure} +\centering +\includegraphics[width=0.8\textwidth]{figures/monitoring_overtraining.png} +\end{figure} + +## Regularization: Avoid overfitting +\scriptsize +[\hfill \textcolor{gray}{http://cs231n.stanford.edu/slides}](http://cs231n.stanford.edu/slides) +\normalsize +\begin{figure} +\centering +\includegraphics[width=0.75\textwidth]{figures/regularization.png} +\end{figure} +\begin{center} +$L_1$ regularization: $R(W) = \sum_k |W_k|$, $L_2$ regularization: $R(W) = \sum_k W_k^2$ +\end{center} + +## Another approach to prevent overfitting: Dropout +* Randomly remove nodes during training +* Avoid co-adaptation of nodes +\begin{figure} +\centering +\includegraphics[width=0.8\textwidth]{figures/dropout.png} +\end{figure} +\scriptsize +\textcolor{gray}{Srivastava et al.,} +[\textcolor{gray}{"Dropout: A Simple Way to Prevent Neural Networks from Overfitting"}](jmlr.org/papers/volume15/srivastava14a.old/srivastava14a.pdf) +\normalsize + + + +## Pros and cons of multi-layer perceptrons + +\textcolor{green}{Pros} + +* Capability to learn non-linear models + +\vspace{3ex} + +\textcolor{red}{Cons} + +* Loss function can have several local minima +* Hyperparameters need to be tuned + * \textcolor{gray}{number of layers, neurons per layer, and training iterations} +* Sensitive to feature scaling + * \textcolor{gray}{preprocessing needed (e.g., scaling of all feature to range [0,1])} + + +## Example 1: 
Boston house prices (MLP regression) (1)
+* Objective: predict house prices in Boston suburbs in the mid-1970s
+* Boston house data set: 506 instances, 13 features
+
+\footnotesize
+```
+ - CRIM     per capita crime rate by town
+ - ZN       proportion of residential land zoned for lots over 25,000 sq.ft.
+ - INDUS    proportion of non-retail business acres per town
+ - CHAS     Charles River dummy variable (= 1 if tract bounds river; 0 otherwise)
+ - NOX      nitric oxides concentration (parts per 10 million)
+ - RM       average number of rooms per dwelling
+ - AGE      proportion of owner-occupied units built prior to 1940
+ - DIS      weighted distances to five Boston employment centres
+ - RAD      index of accessibility to radial highways
+ - TAX      full-value property-tax rate per $10,000
+ - PTRATIO  pupil-teacher ratio by town
+ - B        1000(Bk - 0.63)^2 where Bk is the proportion of blacks by town
+ - LSTAT    % lower status of the population
+ - MEDV     Median value of owner-occupied homes in $1000's (the regression target)
+```
+
+\footnotesize
+[\textcolor{gray}{05\_neural\_networks\_boston\_house\_prices.ipynb}](https://nbviewer.jupyter.org/urls/www.physi.uni-heidelberg.de/~reygers/lectures/2022/ml/examples/05_neural_networks_boston_house_prices.ipynb)
+
+## Example 1: Boston house prices (MLP regression) (2)
+```python
+import numpy as np
+from sklearn import datasets
+from sklearn.model_selection import train_test_split
+from sklearn.neural_network import MLPRegressor
+from sklearn.metrics import mean_squared_error
+
+boston = datasets.load_boston()
+X = boston.data
+y = boston.target
+
+# hold out part of the data for testing (sklearn default: 25%)
+X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1)
+
+mlp = MLPRegressor(hidden_layer_sizes=(100,),
+                   activation='logistic', random_state=1, max_iter=5000)
+mlp.fit(X_train, y_train)
+
+y_pred_mlp = mlp.predict(X_test)
+
+rms = np.sqrt(mean_squared_error(y_test, y_pred_mlp))
+print(f"root mean square error {rms:.2f}")
+```
+
+## Example 1: Boston house prices (MLP regression) (3)
+\begin{center}
+\includegraphics[width=0.7\textwidth]{figures/boston_house_prices.pdf}
+\end{center}
+
+## Exercise 1: XOR
+\small
+[\textcolor{gray}{05\_neural\_networks\_ex\_1\_xor.ipynb}](https://nbviewer.jupyter.org/urls/www.physi.uni-heidelberg.de/~reygers/lectures/2022/ml/exercises/05_neural_networks_ex_1_xor.ipynb)
+\normalsize
+
+::: columns
+:::: {.column width=60%}
+a) Define a multi-layer perceptron classifier that learns the XOR problem.
+\scriptsize
+```python
+from sklearn.neural_network import MLPClassifier
+
+X = [[0, 0], [0, 1], [1, 0], [1, 1]]
+y = [0, 1, 1, 0]
+```
+\normalsize
+b) Define a multi-layer perceptron regressor that fits the depicted 2d data (see notebook).
+
+c) Plot the mean square error vs. the number of training epochs for b).
+::::
+:::: {.column width=40%}
+\vspace{10ex}
+![](figures/xor_like_data.pdf)
+::::
+:::
+
+## Exercise 2: Visualising decision boundaries of classifiers
+
+\small
+[\textcolor{gray}{05\_neural\_networks\_ex\_2\_decision\_boundaries.ipynb}](https://nbviewer.jupyter.org/urls/www.physi.uni-heidelberg.de/~reygers/lectures/2022/ml/exercises/05_neural_networks_ex_2_decision_boundaries.ipynb)
+\normalsize
+
+\vspace{5ex}
+
+Visualize the decision boundaries of a scikit-learn decision tree, a scikit-learn multi-layer perceptron, and XGBoost for different toy data sets.
+
+
+## Exercise 3: Boston house prices (hyperparameter optimization)
+
+\small
+[\textcolor{gray}{05\_neural\_networks\_ex\_3\_boston\_house\_prices.ipynb}](https://nbviewer.jupyter.org/urls/www.physi.uni-heidelberg.de/~reygers/lectures/2022/ml/exercises/05_neural_networks_ex_3_boston_house_prices.ipynb)
+\normalsize
+
+\vspace{5ex}
+
+a) Can you find better hyperparameters (number of hidden layers, neurons per layer, loss function, ...)? Try this first by hand. 
+b) Now use [\textcolor{gray}{sklearn.model\_selection.GridSearchCV}](https://scikit-learn.org/stable/modules/generated/sklearn.model_selection.GridSearchCV.html) to find optimal parameters. + +## TensorFlow + +::: columns +:::: {.column width=70%} + +* Powerful open source library with a focus on deep neural networks +* Performs computations of data flow graphs +* Takes care of computing gradients of the defined functions (\textit{automatic differentiation}) +* Computations in parallel on multiple CPUs or GPUs +* Developed by the Google Brain team +* Initial release in 2015 +* [https://www.tensorflow.org/](https://www.tensorflow.org/) + +:::: +:::: {.column width=30%} +\begin{center} +\includegraphics[width=0.7\textwidth]{figures/tensorflow.png} +\end{center} +:::: +::: + +## Keras + +::: columns +:::: {.column width=70%} + +* Open-source library providing high-level building blocks for developing deep-learning models +* Uses TensorFlow as \textit{backend engine} for low-level tensor manipulation (version 2.4) +* Part of TensorFlow core API since TensorFlow 1.4 release +* Over 375,000 individual users as of early-2020 +* Primary author: Fran\c{c}ois Chollet (Google engineer) +* [https://keras.io/](https://keras.io/) + +:::: +:::: {.column width=30%} +\begin{center} +\includegraphics[width=0.5\textwidth]{figures/keras.png} +\end{center} +:::: +::: + + + +## Example 2: Boston house prices with Keras + +\small +```python +from tensorflow.keras import models +from tensorflow.keras import layers + +model = models.Sequential() +model.add(layers.Dense(64, activation='relu', + input_shape=(train_data.shape[1],))) +model.add(layers.Dense(64, activation='relu')) +model.add(layers.Dense(1)) +model.compile(optimizer='rmsprop', loss='mse', metrics=['mae']) + +model.fit(partial_train_data, partial_train_targets, + epochs=num_epochs, batch_size=1, verbose=0) + +# Evaluate the model on the validation data +val_mse, val_mae = model.evaluate(val_data, val_targets, verbose=0) + +``` +\normalsize + +\footnotesize +[\textcolor{gray}{05\_neural\_networks\_boston\_keras.ipynb}](https://nbviewer.jupyter.org/urls/www.physi.uni-heidelberg.de/~reygers/lectures/2022/ml/examples/05_neural_networks_boston_keras.ipynb) + +## Convolutional neutral networks (CNNs) +\begin{center} +\includegraphics[width=0.7\textwidth]{figures/cnn.png} +\end{center} +::: columns +:::: {.column width=80%} +* CNNs emerged from the study of the visual cortex +* Behind many deep learning successes +* Partially connected layers + * \textcolor{gray}{Fully connected layers impractical for large images (too many neurons, overfitting)} + * Key component: Convolutional layers + * \textcolor{gray}{Set of learnable filters} + * \textcolor{gray}{Low-level features at the first layers; high-level features a the end} +:::: +:::: {.column width=20%} +\small +\textcolor{gray}{Sliding $3 \times3$ filter} +![](figures/cnn_sliding_filter.png) +:::: +::: + +## Different types of layers in a CNN +::: columns +:::: {.column width=50%} +\small \textcolor{gray}{1. Convolutional layers} \newline +\includegraphics[width=0.9\textwidth]{figures/cnn_conv_layer.png} +:::: +:::: {.column width=50%} +\small \textcolor{gray}{3. Fully connected layers} \newline +\includegraphics[width=0.9\textwidth]{figures/cnn_fully_connected.png} +:::: +::: + +\vspace{3ex} + +::: columns +:::: {.column width=60%} +\vfill +\small \textcolor{gray}{2. 
Pooling layers} \newline +\includegraphics[width=\textwidth]{figures/cnn_pooling.png} +:::: +:::: {.column width=40%} +\textcolor{gray}{\footnotesize Afshine Amidi, Shervine Amidi} \ +[\textcolor{gray}{\footnotesize Convolutional Neural Networks cheatsheet}](https://github.com/afshinea/stanford-cs-230-deep-learning/blob/master/en/cheatsheet-convolutional-neural-networks.pdf) +:::: +::: + +## MNIST classification with a CNN in Keras +\footnotesize +```python +from tensorflow.keras.models import Sequential +from tensorflow.keras.layers import Dense, Flatten, MaxPooling2D, Conv2D, Input + +# conv layer with 8 3x3 filters +model = Sequential( + [ + Input(shape=input_shape), + Conv2D(8, kernel_size=(3, 3), activation="relu"), + MaxPooling2D(pool_size=(2, 2)), + Flatten(), + Dense(16, activation="relu"), + Dense(num_classes, activation="softmax"), + ] +) + +model.summary() +``` +\normalsize + +## Defining the CNN in Keras (2) + +\footnotesize +``` +Model: "sequential_1" +_________________________________________________________________ +Layer (type) Output Shape Param # +================================================================= +conv2d_1 (Conv2D) (None, 26, 26, 8) 80 +_________________________________________________________________ +max_pooling2d_1 (MaxPooling2 (None, 13, 13, 8) 0 +_________________________________________________________________ +flatten_1 (Flatten) (None, 1352) 0 +_________________________________________________________________ +dense_2 (Dense) (None, 16) 21648 +_________________________________________________________________ +dense_3 (Dense) (None, 10) 170 +================================================================= +Total params: 21,898 +Trainable params: 21,898 +Non-trainable params: 0 +``` +\normalsize + +## Model definition +Using Keras, you have to `compile` a model, which means adding the loss function, the optimizer algorithm and validation metrics to your training setup. +\vspace{5ex} + +\footnotesize +```python +model.compile(loss="categorical_crossentropy", + optimizer="adam", + metrics=["accuracy"]) +``` +\normalsize + +## Model training + +\footnotesize +```python +from tensorflow.keras.callbacks import ModelCheckpoint, EarlyStopping + +checkpoint = ModelCheckpoint( + filepath="mnist_keras_model.h5", + save_best_only=True, + verbose=1) +early_stopping = EarlyStopping(patience=2) + +history = model.fit(x_train, y_train, # Training data + batch_size=200, # Batch size + epochs=50, # Maximum number of training epochs + validation_split=0.5, # Use 50% of the train dataset for validation + callbacks=[checkpoint, early_stopping]) # Register callbacks +``` +\normalsize + +## Exercise 4: Training a digit-classification neural network on the MNIST dataset using Keras + +\small +[\textcolor{gray}{05\_neural\_networks\_ex\_4\_mnist\_keras\_train.ipynb}](https://nbviewer.jupyter.org/urls/www.physi.uni-heidelberg.de/~reygers/lectures/2022/ml/exercises/05_neural_networks_ex_4_mnist_keras_train.ipynb) +\normalsize + +\vspace{5ex} + +a) Plot training and validation loss as well as training and validation accuracy as a function of the number of epochs + +b) Determine the accuracy of the fully trained model. + +c) Create a second notebook that reads the trained model (`mnist_keras_model.h5`). Read `your_own_digit.png` and classify it. Create your own $28 \times 28$ pixel digits with a program like gimp and check how the model performs. + + + +## Practical advice -- Which algorithm to choose? 
+\textcolor{gray}{From Kaggle competitions:} + +\vspace{3ex} +Structured data: "High level" features that have meaning: + +* feature engineering + decision trees +* Random forests +* XGBoost + +\vspace{3ex} +Unstructured data: "Low level" features, no individual meaning: + +* deep neural networks +* e.g. image classification: convolutional NN + + +## Outlook: Autoencoders + +::: columns +:::: {.column width=50%} +* Unsupervised method based on neural networks to learn a representation of the input data +* Autoencoders learn to copy the input to the output layer + * low dimensional coding of the input in the central layer +* The decoder generates data based on the coding (*generative model*) +* Applications + * Dimensionality reduction + * Denoising of data + * Machine translation +:::: +:::: {.column width=50%} +\vspace{3ex} +\begin{center} +\includegraphics[width=\textwidth]{figures/autoencoder_example.pdf} +\end{center} +:::: +::: + +## Outlook: Generative adversarial network (GANs) + +\begin{center} +\includegraphics[width=0.65\textwidth]{figures/gan.png} +\end{center} +\scriptsize +[\textcolor{gray}{https://developers.google.com/machine-learning/gan/gan\_structure}](https://developers.google.com/machine-learning/gan/gan_structure) +\normalsize + +* Discriminator's classification provides a signal that the generator uses to update its weights +* Application in particle physics: fast detector simulation +* Full GEANT simulation usually very CPU intensive + +## The future + +"Das Interessante an unserer Intelligenz ist, dass wir Go spielen können und dann vom Tisch aufstehen und Essen machen können, was eine Maschine nicht kann." + +\vspace{2ex} + +\color{gray} +\small +\hfill Bernhard Schölkopf, Max-Planck-Institut für intelligente Systeme ([Interview FAZ](https://www.faz.net/aktuell/wirtschaft/kuenstliche-intelligenz/ki-fachmann-wie-gut-europa-in-der-forschung-aufgestellt-ist-16650700.html)) +\normalsize +\color{black} + +\vfill + +"My view is throw it all away and start again" + +\color{gray} +\small +\hfill Geoffrey Hinton (DNN pioneer) on deep neural networks and backpropagation ([Interview, 2017](https://www.axios.com/artificial-intelligence-pioneer-says-we-need-to-start-over-1513305524-f619efbd-9db0-4947-a9b2-7a4c310a28fe.html)) +\normalsize +\color{black} + + diff --git a/slides/run_pandoc.sh b/slides/run_pandoc.sh new file mode 100755 index 0000000..5be9eea --- /dev/null +++ b/slides/run_pandoc.sh @@ -0,0 +1,5 @@ +#!/bin/bash + +# run pandoc for a specific file + +pandoc --pdf-engine=xelatex --variable mainfont="Helvetica" --variable sansfont="Helvetica" -t beamer -s -fmarkdown-implicit_figures --template=template.beamer 05_neural_networks.md -o neural_networks.pdf diff --git a/slides/template.beamer b/slides/template.beamer new file mode 100644 index 0000000..c60ca9c --- /dev/null +++ b/slides/template.beamer @@ -0,0 +1,259 @@ +\documentclass[aspectratio=169,$if(fontsize)$$fontsize$,$endif$$if(lang)$$babel-lang$,$endif$$if(handout)$handout,$endif$$if(beamer)$ignorenonframetext,$endif$$for(classoption)$$classoption$$sep$,$endfor$]{$documentclass$} +\setbeamertemplate{caption}[numbered] +\setbeamertemplate{caption label separator}{: } +\setbeamertemplate{itemize item}[circle] +% \setbeamertemplate{itemize item}{\raisebox{0.1em}{\scalebox{0.6}{$$\blacksquare$$}}} +\setbeamertemplate{itemize subitem}{\raisebox{0.2em}{\scalebox{.7}{$$\blacktriangleright$$}}} +\setbeamercolor{caption name}{fg=normal text.fg} 
+\beamertemplatenavigationsymbols$if(navigation)$$navigation$$else$empty$endif$ +$if(fontfamily)$ +\usepackage[$for(fontfamilyoptions)$$fontfamilyoptions$$sep$,$endfor$]{$fontfamily$} +$else$ +\usepackage{lmodern} +$endif$ +\usepackage{amssymb,amsmath} +\usepackage{ifxetex,ifluatex} +\usepackage{fixltx2e} % provides \textsubscript +\usepackage{amsbsy} +\usepackage{bm} +\renewcommand*{\vec}[1]{\bm{#1}} +\newcommand*{\mat}[1]{\bm{#1}} +\setbeamertemplate{footline}{\hspace{155mm}\insertframenumber\vspace{1mm}\hspace{10mm}} +\ifnum 0\ifxetex 1\fi\ifluatex 1\fi=0 % if pdftex + \usepackage[$if(fontenc)$$fontenc$$else$T1$endif$]{fontenc} + \usepackage[utf8]{inputenc} +$if(euro)$ + \usepackage{eurosym} +$endif$ +\else % if luatex or xelatex + \ifxetex + \usepackage{mathspec} + \else + \usepackage{fontspec} + \fi + \defaultfontfeatures{Ligatures=TeX,Scale=MatchUppercase} +$if(euro)$ + \newcommand{\euro}{€} +$endif$ +$if(mainfont)$ + \setmainfont[$for(mainfontoptions)$$mainfontoptions$$sep$,$endfor$]{$mainfont$} +$endif$ +$if(sansfont)$ + \setsansfont[$for(sansfontoptions)$$sansfontoptions$$sep$,$endfor$]{$sansfont$} +$endif$ +$if(monofont)$ + \setmonofont[Mapping=tex-ansi$if(monofontoptions)$,$for(monofontoptions)$$monofontoptions$$sep$,$endfor$$endif$]{$monofont$} +$endif$ +$if(mathfont)$ + \setmathfont(Digits,Latin,Greek)[$for(mathfontoptions)$$mathfontoptions$$sep$,$endfor$]{$mathfont$} +$endif$ +$if(CJKmainfont)$ + \usepackage{xeCJK} + \setCJKmainfont[$for(CJKoptions)$$CJKoptions$$sep$,$endfor$]{$CJKmainfont$} +$endif$ +\fi +$if(theme)$ +\usetheme{$theme$} +$endif$ +$if(colortheme)$ +\usecolortheme{$colortheme$} +$endif$ +$if(fonttheme)$ +\usefonttheme{$fonttheme$} +$endif$ +$if(mainfont)$ +\usefonttheme{serif} % use mainfont rather than sansfont for slide text +$endif$ +$if(innertheme)$ +\useinnertheme{$innertheme$} +$endif$ +$if(outertheme)$ +\useoutertheme{$outertheme$} +$endif$ +% use upquote if available, for straight quotes in verbatim environments +\IfFileExists{upquote.sty}{\usepackage{upquote}}{} +% use microtype if available +\IfFileExists{microtype.sty}{% +\usepackage{microtype} +\UseMicrotypeSet[protrusion]{basicmath} % disable protrusion for tt fonts +}{} +$if(lang)$ +\ifnum 0\ifxetex 1\fi\ifluatex 1\fi=0 % if pdftex + \usepackage[shorthands=off,$for(babel-otherlangs)$$babel-otherlangs$,$endfor$main=$babel-lang$]{babel} +$if(babel-newcommands)$ + $babel-newcommands$ +$endif$ +\else + \usepackage{polyglossia} + \setmainlanguage[$polyglossia-lang.options$]{$polyglossia-lang.name$} +$for(polyglossia-otherlangs)$ + \setotherlanguage[$polyglossia-otherlangs.options$]{$polyglossia-otherlangs.name$} +$endfor$ +\fi +$endif$ +\newif\ifbibliography +$if(natbib)$ +\usepackage{natbib} +\bibliographystyle{$if(biblio-style)$$biblio-style$$else$plainnat$endif$} +$endif$ +$if(biblatex)$ +\usepackage[$if(biblio-style)$style=$biblio-style$,$endif$$for(biblatexoptions)$$biblatexoptions$$sep$,$endfor$]{biblatex} +$for(bibliography)$ +\addbibresource{$bibliography$} +$endfor$ +$endif$ +$if(listings)$ +\usepackage{listings} +$endif$ +$if(lhs)$ +\lstnewenvironment{code}{\lstset{language=Haskell,basicstyle=\small\ttfamily}}{} +$endif$ +$if(highlighting-macros)$ +$highlighting-macros$ +$endif$ +$if(verbatim-in-note)$ +\usepackage{fancyvrb} +\VerbatimFootnotes % allows verbatim text in footnotes +$endif$ +$if(tables)$ +\usepackage{longtable,booktabs} +\usepackage{caption} +% These lines are needed to make table captions work with longtable: +\makeatletter +\def\fnum@table{\tablename~\thetable} +\makeatother 
+$endif$ +$if(graphics)$ +\usepackage{graphicx,grffile} +\makeatletter +\def\maxwidth{\ifdim\Gin@nat@width>\linewidth\linewidth\else\Gin@nat@width\fi} +\def\maxheight{\ifdim\Gin@nat@height>\textheight0.8\textheight\else\Gin@nat@height\fi} +\makeatother +% Scale images if necessary, so that they will not overflow the page +% margins by default, and it is still possible to overwrite the defaults +% using explicit options in \includegraphics[width, height, ...]{} +\setkeys{Gin}{width=\maxwidth,height=\maxheight,keepaspectratio} +$endif$ + +% Prevent slide breaks in the middle of a paragraph: +\widowpenalties 1 10000 +\raggedbottom + +$if(section-titles)$ +\AtBeginPart{ + \let\insertpartnumber\relax + \let\partname\relax + \frame{\partpage} +} +\AtBeginSection{ + \ifbibliography + \else + \let\insertsectionnumber\relax + \let\sectionname\relax + \frame{\sectionpage} + \fi +} +\AtBeginSubsection{ + \let\insertsubsectionnumber\relax + \let\subsectionname\relax + \frame{\subsectionpage} +} +$endif$ + +$if(links-as-notes)$ +% Make links footnotes instead of hotlinks: +\renewcommand{\href}[2]{#2\footnote{\url{#1}}} +$endif$ +$if(strikeout)$ +\usepackage[normalem]{ulem} +% avoid problems with \sout in headers with hyperref: +\pdfstringdefDisableCommands{\renewcommand{\sout}{}} +$endif$ +\setlength{\emergencystretch}{3em} % prevent overfull lines +\providecommand{\tightlist}{% + \setlength{\itemsep}{1ex}\setlength{\parskip}{0pt}} +$if(numbersections)$ +\setcounter{secnumdepth}{5} +$else$ +\setcounter{secnumdepth}{0} +$endif$ +$if(dir)$ +\ifxetex + % load bidi as late as possible as it modifies e.g. graphicx + $if(latex-dir-rtl)$ + \usepackage[RTLdocument]{bidi} + $else$ + \usepackage{bidi} + $endif$ +\fi +\ifnum 0\ifxetex 1\fi\ifluatex 1\fi=0 % if pdftex + \TeXXeTstate=1 + \newcommand{\RL}[1]{\beginR #1\endR} + \newcommand{\LR}[1]{\beginL #1\endL} + \newenvironment{RTL}{\beginR}{\endR} + \newenvironment{LTR}{\beginL}{\endL} +\fi +$endif$ +$for(header-includes)$ +$header-includes$ +$endfor$ + +$if(title)$ +\title{$title$} +$endif$ +$if(subtitle)$ +\subtitle{$subtitle$} +$endif$ +$if(author)$ +\author{$for(author)$$author$$sep$ \and $endfor$} +$endif$ +$if(institute)$ +\institute{$for(institute)$$institute$$sep$ \and $endfor$} +$endif$ +\date{$date$} + +\begin{document} +$if(title)$ +\frame{\titlepage} +$endif$ + +$for(include-before)$ +$include-before$ + +$endfor$ +$if(toc)$ +\begin{frame} +\tableofcontents[hideallsubsections] +\end{frame} + +$endif$ +$body$ + +$if(natbib)$ +$if(bibliography)$ +$if(biblio-title)$ +$if(book-class)$ +\renewcommand\bibname{$biblio-title$} +$else$ +\renewcommand\refname{$biblio-title$} +$endif$ +$endif$ +\begin{frame}[allowframebreaks]{$biblio-title$} +\bibliographytrue +\bibliography{$for(bibliography)$$bibliography$$sep$,$endfor$} +\end{frame} + +$endif$ +$endif$ +$if(biblatex)$ +\begin{frame}[allowframebreaks]{$biblio-title$} +\bibliographytrue +\printbibliography[heading=none] +\end{frame} + +$endif$ +$for(include-after)$ +$include-after$ + +$endfor$ +\end{document}