ML-Kurs-SS2023/notebooks/01_intro_ex_2_sol.ipynb

227 lines
5.3 KiB
Plaintext
Raw Normal View History

2023-04-03 13:08:49 +02:00
{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Exercise 2: Example for pandas using the heart.csv data set"
]
},
{
"cell_type": "code",
2023-04-09 13:23:19 +02:00
"execution_count": null,
2023-04-03 13:08:49 +02:00
"metadata": {},
"outputs": [],
"source": [
"import numpy as np\n",
"import pandas as pd\n",
"import matplotlib.pyplot as plt"
]
},
{
"cell_type": "code",
2023-04-09 13:23:19 +02:00
"execution_count": null,
2023-04-03 13:08:49 +02:00
"metadata": {},
"outputs": [],
"source": [
"# read the csv Data \n",
"df = pd.read_csv('heart.csv')"
]
},
{
"cell_type": "code",
2023-04-09 13:23:19 +02:00
"execution_count": null,
2023-04-03 13:08:49 +02:00
"metadata": {},
2023-04-09 13:23:19 +02:00
"outputs": [],
2023-04-03 13:08:49 +02:00
"source": [
"# What is the number of columns and rows\n",
"print(df.columns)\n",
"print (df.info())\n",
"print(df.dtypes)"
]
},
{
"cell_type": "code",
2023-04-09 13:23:19 +02:00
"execution_count": null,
2023-04-03 13:08:49 +02:00
"metadata": {},
2023-04-09 13:23:19 +02:00
"outputs": [],
2023-04-03 13:08:49 +02:00
"source": [
"# get first 3 lines\n",
"print(df.head(3))"
]
},
{
"cell_type": "code",
2023-04-09 13:23:19 +02:00
"execution_count": null,
2023-04-03 13:08:49 +02:00
"metadata": {},
2023-04-09 13:23:19 +02:00
"outputs": [],
2023-04-03 13:08:49 +02:00
"source": [
"#display statistics summary\n",
"print(df.describe())"
]
},
{
"cell_type": "code",
2023-04-09 13:23:19 +02:00
"execution_count": null,
2023-04-03 13:08:49 +02:00
"metadata": {},
2023-04-09 13:23:19 +02:00
"outputs": [],
2023-04-03 13:08:49 +02:00
"source": [
"#display correlation\n",
"print (df.corr())"
]
},
{
"cell_type": "code",
2023-04-09 13:23:19 +02:00
"execution_count": null,
2023-04-03 13:08:49 +02:00
"metadata": {},
2023-04-09 13:23:19 +02:00
"outputs": [],
2023-04-03 13:08:49 +02:00
"source": [
"# Print mean values for each column with and without disease\n",
"print(df.groupby('target').mean())"
]
},
{
"cell_type": "code",
2023-04-09 13:23:19 +02:00
"execution_count": null,
2023-04-03 13:08:49 +02:00
"metadata": {},
2023-04-09 13:23:19 +02:00
"outputs": [],
2023-04-03 13:08:49 +02:00
"source": [
"# get table with selection on more than 1 column\n",
"df1 = df[(df[\"sex\"] == 0) & (df[\"target\"] == 0) ]\n",
"print (df1.head(5))"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
" Plots"
]
},
{
"cell_type": "code",
2023-04-09 13:23:19 +02:00
"execution_count": null,
2023-04-03 13:08:49 +02:00
"metadata": {},
2023-04-09 13:23:19 +02:00
"outputs": [],
2023-04-03 13:08:49 +02:00
"source": [
"# age dirtibution group into male and female (1 = male; 0 = female)\n",
"# male\n",
"plt.title('age distribution according to Sex') \n",
"df[df[\"sex\"] == 1]['age'].plot.hist()\n",
"print(df[df[\"sex\"] > 0]['age'])\n",
"# female\n",
"df[df[\"sex\"] == 0]['age'].plot.hist()\n",
"plt.xlabel('age [years]')\n",
"plt.legend([\"male\", \"female\"])"
]
},
{
"cell_type": "code",
2023-04-09 13:23:19 +02:00
"execution_count": null,
2023-04-03 13:08:49 +02:00
"metadata": {},
2023-04-09 13:23:19 +02:00
"outputs": [],
2023-04-03 13:08:49 +02:00
"source": [
"plt.figure()\n",
"# Plot maximum heart rate\n",
"# Heart disease (0 = no, 1 = yes)\n",
"plt.title('maximum heart rate according to heart disease') \n",
"df[df[\"target\"] == 1]['thalach'].plot.hist()\n",
"# no disease\n",
"df[df[\"target\"] == 0]['thalach'].plot.hist()\n",
"plt.legend([\"disease\", \"no disease\"])\n",
"plt.xlabel('max heart rate')"
]
},
{
"cell_type": "code",
2023-04-09 13:23:19 +02:00
"execution_count": null,
2023-04-03 13:08:49 +02:00
"metadata": {
"scrolled": true
},
2023-04-09 13:23:19 +02:00
"outputs": [],
2023-04-03 13:08:49 +02:00
"source": [
"# Plot sex and target in one histogramm via crosstab\n",
"pd.crosstab(df.sex,df.target).plot(kind=\"bar\",color=['red','blue' ])\n",
"plt.title('Heart Disease distribution according to Sex')\n",
"plt.xlabel('Sex (0 = Female, 1 = Male)')\n",
"plt.legend([\"no disease\", \"disease\"])"
]
},
{
"cell_type": "code",
2023-04-09 13:23:19 +02:00
"execution_count": null,
2023-04-03 13:08:49 +02:00
"metadata": {
"scrolled": true
},
2023-04-09 13:23:19 +02:00
"outputs": [],
2023-04-03 13:08:49 +02:00
"source": [
"# Plot target and cp in one histogramm via crosstab\n",
"pd.crosstab(df.cp,df.target).plot(kind=\"bar\",figsize=(15,6),color=['#11A5AA','#AA1190' ])\n",
"plt.title('Heart Disease Distribution According To Chest Pain Type')\n",
"plt.xlabel('Chest Pain Type')\n",
"plt.xticks(rotation = 0)\n",
"plt.ylabel('Frequency of Disease or Not')\n"
]
},
{
"cell_type": "code",
2023-04-09 13:23:19 +02:00
"execution_count": null,
2023-04-03 13:08:49 +02:00
"metadata": {},
2023-04-09 13:23:19 +02:00
"outputs": [],
2023-04-03 13:08:49 +02:00
"source": [
"# plot correlations for target\n",
"plt.figure()\n",
"plt.scatter(x=df.age[df.target==1], y=df.thalach[(df.target==1)], c=\"red\")\n",
"plt.scatter(x=df.age[df.target==0], y=df.thalach[(df.target==0)])\n",
"plt.title('Age-max Heart Rate Plot')\n",
"plt.xlabel('age[years]')\n",
"plt.ylabel('max. heart rate')\n",
"plt.legend([\"Disease\", \"No Disease\"])"
]
},
{
"cell_type": "code",
2023-04-09 13:23:19 +02:00
"execution_count": null,
2023-04-03 13:08:49 +02:00
"metadata": {},
2023-04-09 13:23:19 +02:00
"outputs": [],
2023-04-03 13:08:49 +02:00
"source": [
"plt.figure()\n",
"plt.scatter(x=df.age[df.target==1], y=df.chol[(df.target==1)], c=\"red\")\n",
"plt.scatter(x=df.age[df.target==0], y=df.chol[(df.target==0)])\n",
"plt.title('Age-Cholesterol Plot')\n",
"plt.xlabel('age[years]')\n",
"plt.ylabel('Cholesterol')\n",
"plt.legend([\"Disease\", \"No Disease\"])"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.8.16"
}
},
"nbformat": 4,
"nbformat_minor": 4
}