ML-Kurs-SS2023/notebooks/01_intro_ex_2_sol.ipynb

{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Exercise 2: Example for pandas using the heart.csv data set"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import numpy as np\n",
    "import pandas as pd\n",
    "import matplotlib.pyplot as plt"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Read the CSV data\n",
    "df = pd.read_csv('heart.csv')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Inspect the column names, the row/column counts (via info()) and the dtypes\n",
    "print(df.columns)\n",
    "print (df.info())\n",
    "print(df.dtypes)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# get first 3 lines\n",
    "print(df.head(3))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "#display statistics summary\n",
    "print(df.describe())"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "#display correlation\n",
    "print (df.corr())"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Print mean values for each column with and without disease\n",
    "print(df.groupby('target').mean())"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# get table with selection on more than 1 column\n",
    "df1 = df[(df[\"sex\"] == 0) & (df[\"target\"] == 0) ]\n",
    "print (df1.head(5))"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    " Plots"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# age distribution grouped by sex (1 = male; 0 = female)\n",
    "# male\n",
    "plt.title('age distribution according to Sex') \n",
    "df[df[\"sex\"] == 1]['age'].plot.hist()\n",
    "print(df[df[\"sex\"] > 0]['age'])\n",
    "# female\n",
    "df[df[\"sex\"] == 0]['age'].plot.hist()\n",
    "plt.xlabel('age [years]')\n",
    "plt.legend([\"male\", \"female\"])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "plt.figure()\n",
    "# Plot maximum heart rate\n",
    "# Heart disease (0 = no, 1 = yes)\n",
    "plt.title('maximum heart rate according to heart disease') \n",
    "df[df[\"target\"] == 1]['thalach'].plot.hist()\n",
    "# no disease\n",
    "df[df[\"target\"] == 0]['thalach'].plot.hist()\n",
    "plt.legend([\"disease\", \"no disease\"])\n",
    "plt.xlabel('max heart rate')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "scrolled": true
   },
   "outputs": [],
   "source": [
    "# Plot sex and target in one bar chart via crosstab\n",
    "pd.crosstab(df.sex,df.target).plot(kind=\"bar\",color=['red','blue' ])\n",
    "plt.title('Heart Disease distribution according to Sex')\n",
    "plt.xlabel('Sex (0 = Female, 1 = Male)')\n",
    "plt.legend([\"no disease\", \"disease\"])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "scrolled": true
   },
   "outputs": [],
   "source": [
    "# Plot target and cp in one bar chart via crosstab\n",
    "pd.crosstab(df.cp,df.target).plot(kind=\"bar\",figsize=(15,6),color=['#11A5AA','#AA1190' ])\n",
    "plt.title('Heart Disease Distribution According To Chest Pain Type')\n",
    "plt.xlabel('Chest Pain Type')\n",
    "plt.xticks(rotation = 0)\n",
    "plt.ylabel('Frequency of Disease or Not')\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# plot correlations for target\n",
    "plt.figure()\n",
    "plt.scatter(x=df.age[df.target==1], y=df.thalach[(df.target==1)], c=\"red\")\n",
    "plt.scatter(x=df.age[df.target==0], y=df.thalach[(df.target==0)])\n",
    "plt.title('Age-max Heart Rate Plot')\n",
    "plt.xlabel('age[years]')\n",
    "plt.ylabel('max. heart rate')\n",
    "plt.legend([\"Disease\", \"No Disease\"])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "plt.figure()\n",
    "plt.scatter(x=df.age[df.target==1], y=df.chol[(df.target==1)], c=\"red\")\n",
    "plt.scatter(x=df.age[df.target==0], y=df.chol[(df.target==0)])\n",
    "plt.title('Age-Cholesterol Plot')\n",
    "plt.xlabel('age[years]')\n",
    "plt.ylabel('Cholesterol')\n",
    "plt.legend([\"Disease\", \"No Disease\"])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.8.16"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 4
}
update files 2023-04-03 13:08:49 +02:00			`{`
			`"cells": [`
			`{`
			`"cell_type": "markdown",`
			`"metadata": {},`
			`"source": [`
			`"Exercise 2: Example for pandas using the heart.csv data set"`
			`]`
			`},`
			`{`
			`"cell_type": "code",`
Update 2023-04-09 13:23:19 +02:00			`"execution_count": null,`
update files 2023-04-03 13:08:49 +02:00			`"metadata": {},`
			`"outputs": [],`
			`"source": [`
			`"import numpy as np\n",`
			`"import pandas as pd\n",`
			`"import matplotlib.pyplot as plt"`
			`]`
			`},`
			`{`
			`"cell_type": "code",`
Update 2023-04-09 13:23:19 +02:00			`"execution_count": null,`
update files 2023-04-03 13:08:49 +02:00			`"metadata": {},`
			`"outputs": [],`
			`"source": [`
			`"# read the csv Data \n",`
			`"df = pd.read_csv('heart.csv')"`
			`]`
			`},`
			`{`
			`"cell_type": "code",`
Update 2023-04-09 13:23:19 +02:00			`"execution_count": null,`
update files 2023-04-03 13:08:49 +02:00			`"metadata": {},`
Update 2023-04-09 13:23:19 +02:00			`"outputs": [],`
update files 2023-04-03 13:08:49 +02:00			`"source": [`
			`"# What is the number of columns and rows\n",`
			`"print(df.columns)\n",`
			`"print (df.info())\n",`
			`"print(df.dtypes)"`
			`]`
			`},`
			`{`
			`"cell_type": "code",`
Update 2023-04-09 13:23:19 +02:00			`"execution_count": null,`
update files 2023-04-03 13:08:49 +02:00			`"metadata": {},`
Update 2023-04-09 13:23:19 +02:00			`"outputs": [],`
update files 2023-04-03 13:08:49 +02:00			`"source": [`
			`"# get first 3 lines\n",`
			`"print(df.head(3))"`
			`]`
			`},`
			`{`
			`"cell_type": "code",`
Update 2023-04-09 13:23:19 +02:00			`"execution_count": null,`
update files 2023-04-03 13:08:49 +02:00			`"metadata": {},`
Update 2023-04-09 13:23:19 +02:00			`"outputs": [],`
update files 2023-04-03 13:08:49 +02:00			`"source": [`
			`"#display statistics summary\n",`
			`"print(df.describe())"`
			`]`
			`},`
			`{`
			`"cell_type": "code",`
Update 2023-04-09 13:23:19 +02:00			`"execution_count": null,`
update files 2023-04-03 13:08:49 +02:00			`"metadata": {},`
Update 2023-04-09 13:23:19 +02:00			`"outputs": [],`
update files 2023-04-03 13:08:49 +02:00			`"source": [`
			`"#display correlation\n",`
			`"print (df.corr())"`
			`]`
			`},`
			`{`
			`"cell_type": "code",`
Update 2023-04-09 13:23:19 +02:00			`"execution_count": null,`
update files 2023-04-03 13:08:49 +02:00			`"metadata": {},`
Update 2023-04-09 13:23:19 +02:00			`"outputs": [],`
update files 2023-04-03 13:08:49 +02:00			`"source": [`
			`"# Print mean values for each column with and without disease\n",`
			`"print(df.groupby('target').mean())"`
			`]`
			`},`
			`{`
			`"cell_type": "code",`
Update 2023-04-09 13:23:19 +02:00			`"execution_count": null,`
update files 2023-04-03 13:08:49 +02:00			`"metadata": {},`
Update 2023-04-09 13:23:19 +02:00			`"outputs": [],`
update files 2023-04-03 13:08:49 +02:00			`"source": [`
			`"# get table with selection on more than 1 column\n",`
			`"df1 = df[(df[\"sex\"] == 0) & (df[\"target\"] == 0) ]\n",`
			`"print (df1.head(5))"`
			`]`
			`},`
			`{`
			`"cell_type": "markdown",`
			`"metadata": {},`
			`"source": [`
			`" Plots"`
			`]`
			`},`
			`{`
			`"cell_type": "code",`
Update 2023-04-09 13:23:19 +02:00			`"execution_count": null,`
update files 2023-04-03 13:08:49 +02:00			`"metadata": {},`
Update 2023-04-09 13:23:19 +02:00			`"outputs": [],`
update files 2023-04-03 13:08:49 +02:00			`"source": [`
			`"# age distribution grouped by sex (1 = male; 0 = female)\n",`
			`"# male\n",`
			`"plt.title('age distribution according to Sex') \n",`
			`"df[df[\"sex\"] == 1]['age'].plot.hist()\n",`
			`"print(df[df[\"sex\"] > 0]['age'])\n",`
			`"# female\n",`
			`"df[df[\"sex\"] == 0]['age'].plot.hist()\n",`
			`"plt.xlabel('age [years]')\n",`
			`"plt.legend([\"male\", \"female\"])"`
			`]`
			`},`
			`{`
			`"cell_type": "code",`
Update 2023-04-09 13:23:19 +02:00			`"execution_count": null,`
update files 2023-04-03 13:08:49 +02:00			`"metadata": {},`
Update 2023-04-09 13:23:19 +02:00			`"outputs": [],`
update files 2023-04-03 13:08:49 +02:00			`"source": [`
			`"plt.figure()\n",`
			`"# Plot maximum heart rate\n",`
			`"# Heart disease (0 = no, 1 = yes)\n",`
			`"plt.title('maximum heart rate according to heart disease') \n",`
			`"df[df[\"target\"] == 1]['thalach'].plot.hist()\n",`
			`"# no disease\n",`
			`"df[df[\"target\"] == 0]['thalach'].plot.hist()\n",`
			`"plt.legend([\"disease\", \"no disease\"])\n",`
			`"plt.xlabel('max heart rate')"`
			`]`
			`},`
			`{`
			`"cell_type": "code",`
Update 2023-04-09 13:23:19 +02:00			`"execution_count": null,`
update files 2023-04-03 13:08:49 +02:00			`"metadata": {`
			`"scrolled": true`
			`},`
Update 2023-04-09 13:23:19 +02:00			`"outputs": [],`
update files 2023-04-03 13:08:49 +02:00			`"source": [`
			`"# Plot sex and target in one bar chart via crosstab\n",`
			`"pd.crosstab(df.sex,df.target).plot(kind=\"bar\",color=['red','blue' ])\n",`
			`"plt.title('Heart Disease distribution according to Sex')\n",`
			`"plt.xlabel('Sex (0 = Female, 1 = Male)')\n",`
			`"plt.legend([\"no disease\", \"disease\"])"`
			`]`
			`},`
			`{`
			`"cell_type": "code",`
Update 2023-04-09 13:23:19 +02:00			`"execution_count": null,`
update files 2023-04-03 13:08:49 +02:00			`"metadata": {`
			`"scrolled": true`
			`},`
Update 2023-04-09 13:23:19 +02:00			`"outputs": [],`
update files 2023-04-03 13:08:49 +02:00			`"source": [`
			`"# Plot target and cp in one bar chart via crosstab\n",`
			`"pd.crosstab(df.cp,df.target).plot(kind=\"bar\",figsize=(15,6),color=['#11A5AA','#AA1190' ])\n",`
			`"plt.title('Heart Disease Distribution According To Chest Pain Type')\n",`
			`"plt.xlabel('Chest Pain Type')\n",`
			`"plt.xticks(rotation = 0)\n",`
			`"plt.ylabel('Frequency of Disease or Not')\n"`
			`]`
			`},`
			`{`
			`"cell_type": "code",`
Update 2023-04-09 13:23:19 +02:00			`"execution_count": null,`
update files 2023-04-03 13:08:49 +02:00			`"metadata": {},`
Update 2023-04-09 13:23:19 +02:00			`"outputs": [],`
update files 2023-04-03 13:08:49 +02:00			`"source": [`
			`"# plot correlations for target\n",`
			`"plt.figure()\n",`
			`"plt.scatter(x=df.age[df.target==1], y=df.thalach[(df.target==1)], c=\"red\")\n",`
			`"plt.scatter(x=df.age[df.target==0], y=df.thalach[(df.target==0)])\n",`
			`"plt.title('Age-max Heart Rate Plot')\n",`
			`"plt.xlabel('age[years]')\n",`
			`"plt.ylabel('max. heart rate')\n",`
			`"plt.legend([\"Disease\", \"No Disease\"])"`
			`]`
			`},`
			`{`
			`"cell_type": "code",`
Update 2023-04-09 13:23:19 +02:00			`"execution_count": null,`
update files 2023-04-03 13:08:49 +02:00			`"metadata": {},`
Update 2023-04-09 13:23:19 +02:00			`"outputs": [],`
update files 2023-04-03 13:08:49 +02:00			`"source": [`
			`"plt.figure()\n",`
			`"plt.scatter(x=df.age[df.target==1], y=df.chol[(df.target==1)], c=\"red\")\n",`
			`"plt.scatter(x=df.age[df.target==0], y=df.chol[(df.target==0)])\n",`
			`"plt.title('Age-Cholesterol Plot')\n",`
			`"plt.xlabel('age[years]')\n",`
			`"plt.ylabel('Cholesterol')\n",`
			`"plt.legend([\"Disease\", \"No Disease\"])"`
			`]`
			`},`
			`{`
			`"cell_type": "code",`
			`"execution_count": null,`
			`"metadata": {},`
			`"outputs": [],`
			`"source": []`
			`}`
			`],`
			`"metadata": {`
			`"kernelspec": {`
			`"display_name": "Python 3 (ipykernel)",`
			`"language": "python",`
			`"name": "python3"`
			`},`
			`"language_info": {`
			`"codemirror_mode": {`
			`"name": "ipython",`
			`"version": 3`
			`},`
			`"file_extension": ".py",`
			`"mimetype": "text/x-python",`
			`"name": "python",`
			`"nbconvert_exporter": "python",`
			`"pygments_lexer": "ipython3",`
			`"version": "3.8.16"`
			`}`
			`},`
			`"nbformat": 4,`
			`"nbformat_minor": 4`
			`}`