2023-04-03 13:08:49 +02:00
|
|
|
{
|
|
|
|
"cells": [
|
|
|
|
{
|
|
|
|
"cell_type": "markdown",
|
|
|
|
"metadata": {},
|
|
|
|
"source": [
|
|
|
|
"Exercise 2: Example for pandas using the heart.csv data set"
|
|
|
|
]
|
|
|
|
},
|
|
|
|
{
|
|
|
|
"cell_type": "code",
|
2023-04-09 13:23:19 +02:00
|
|
|
"execution_count": null,
|
2023-04-03 13:08:49 +02:00
|
|
|
"metadata": {},
|
|
|
|
"outputs": [],
|
|
|
|
"source": [
|
|
|
|
"import numpy as np\n",
|
|
|
|
"import pandas as pd\n",
|
|
|
|
"import matplotlib.pyplot as plt"
|
|
|
|
]
|
|
|
|
},
|
|
|
|
{
|
|
|
|
"cell_type": "code",
|
2023-04-09 13:23:19 +02:00
|
|
|
"execution_count": null,
|
2023-04-03 13:08:49 +02:00
|
|
|
"metadata": {},
|
|
|
|
"outputs": [],
|
|
|
|
"source": [
|
|
|
|
"# Read the heart.csv data set into a DataFrame\n",
|
|
|
|
"df = pd.read_csv('heart.csv')"
|
|
|
|
]
|
|
|
|
},
|
|
|
|
{
|
|
|
|
"cell_type": "code",
|
2023-04-09 13:23:19 +02:00
|
|
|
"execution_count": null,
|
2023-04-03 13:08:49 +02:00
|
|
|
"metadata": {},
|
2023-04-09 13:23:19 +02:00
|
|
|
"outputs": [],
|
2023-04-03 13:08:49 +02:00
|
|
|
"source": [
|
|
|
|
"# Inspect the column names, the row/column counts (via info) and the dtypes\n",
|
|
|
|
"print(df.columns)\n",
|
|
|
|
"print (df.info())\n",
|
|
|
|
"print(df.dtypes)"
|
|
|
|
]
|
|
|
|
},
|
|
|
|
{
|
|
|
|
"cell_type": "code",
|
2023-04-09 13:23:19 +02:00
|
|
|
"execution_count": null,
|
2023-04-03 13:08:49 +02:00
|
|
|
"metadata": {},
|
2023-04-09 13:23:19 +02:00
|
|
|
"outputs": [],
|
2023-04-03 13:08:49 +02:00
|
|
|
"source": [
|
|
|
|
"# get first 3 lines\n",
|
|
|
|
"print(df.head(3))"
|
|
|
|
]
|
|
|
|
},
|
|
|
|
{
|
|
|
|
"cell_type": "code",
|
2023-04-09 13:23:19 +02:00
|
|
|
"execution_count": null,
|
2023-04-03 13:08:49 +02:00
|
|
|
"metadata": {},
|
2023-04-09 13:23:19 +02:00
|
|
|
"outputs": [],
|
2023-04-03 13:08:49 +02:00
|
|
|
"source": [
|
|
|
|
"# Display summary statistics\n",
|
|
|
|
"print(df.describe())"
|
|
|
|
]
|
|
|
|
},
|
|
|
|
{
|
|
|
|
"cell_type": "code",
|
2023-04-09 13:23:19 +02:00
|
|
|
"execution_count": null,
|
2023-04-03 13:08:49 +02:00
|
|
|
"metadata": {},
|
2023-04-09 13:23:19 +02:00
|
|
|
"outputs": [],
|
2023-04-03 13:08:49 +02:00
|
|
|
"source": [
|
|
|
|
"# Display the pairwise correlation between the columns\n",
|
|
|
|
"print (df.corr())"
|
|
|
|
]
|
|
|
|
},
|
|
|
|
{
|
|
|
|
"cell_type": "code",
|
2023-04-09 13:23:19 +02:00
|
|
|
"execution_count": null,
|
2023-04-03 13:08:49 +02:00
|
|
|
"metadata": {},
|
2023-04-09 13:23:19 +02:00
|
|
|
"outputs": [],
|
2023-04-03 13:08:49 +02:00
|
|
|
"source": [
|
|
|
|
"# Print mean values for each column with and without disease\n",
|
|
|
|
"print(df.groupby('target').mean())"
|
|
|
|
]
|
|
|
|
},
|
|
|
|
{
|
|
|
|
"cell_type": "code",
|
2023-04-09 13:23:19 +02:00
|
|
|
"execution_count": null,
|
2023-04-03 13:08:49 +02:00
|
|
|
"metadata": {},
|
2023-04-09 13:23:19 +02:00
|
|
|
"outputs": [],
|
2023-04-03 13:08:49 +02:00
|
|
|
"source": [
|
|
|
|
"# Select rows using conditions on more than one column\n",
|
|
|
|
"df1 = df[(df[\"sex\"] == 0) & (df[\"target\"] == 0) ]\n",
|
|
|
|
"print (df1.head(5))"
|
|
|
|
]
|
|
|
|
},
|
|
|
|
{
|
|
|
|
"cell_type": "markdown",
|
|
|
|
"metadata": {},
|
|
|
|
"source": [
|
|
|
|
"## Plots"
|
|
|
|
]
|
|
|
|
},
|
|
|
|
{
|
|
|
|
"cell_type": "code",
|
2023-04-09 13:23:19 +02:00
|
|
|
"execution_count": null,
|
2023-04-03 13:08:49 +02:00
|
|
|
"metadata": {},
|
2023-04-09 13:23:19 +02:00
|
|
|
"outputs": [],
|
2023-04-03 13:08:49 +02:00
|
|
|
"source": [
|
|
|
|
"# Age distribution grouped by sex (1 = male; 0 = female)\n",
|
|
|
|
"# male\n",
|
|
|
|
"plt.title('age distribution according to Sex') \n",
|
|
|
|
"df[df[\"sex\"] == 1]['age'].plot.hist()\n",
|
|
|
|
"print(df[df[\"sex\"] > 0]['age'])\n",
|
|
|
|
"# female\n",
|
|
|
|
"df[df[\"sex\"] == 0]['age'].plot.hist()\n",
|
|
|
|
"plt.xlabel('age [years]')\n",
|
|
|
|
"plt.legend([\"male\", \"female\"])"
|
|
|
|
]
|
|
|
|
},
|
|
|
|
{
|
|
|
|
"cell_type": "code",
|
2023-04-09 13:23:19 +02:00
|
|
|
"execution_count": null,
|
2023-04-03 13:08:49 +02:00
|
|
|
"metadata": {},
|
2023-04-09 13:23:19 +02:00
|
|
|
"outputs": [],
|
2023-04-03 13:08:49 +02:00
|
|
|
"source": [
|
|
|
|
"plt.figure()\n",
|
|
|
|
"# Plot maximum heart rate\n",
|
|
|
|
"# Heart disease (0 = no, 1 = yes)\n",
|
|
|
|
"plt.title('maximum heart rate according to heart disease') \n",
|
|
|
|
"df[df[\"target\"] == 1]['thalach'].plot.hist()\n",
|
|
|
|
"# no disease\n",
|
|
|
|
"df[df[\"target\"] == 0]['thalach'].plot.hist()\n",
|
|
|
|
"plt.legend([\"disease\", \"no disease\"])\n",
|
|
|
|
"plt.xlabel('max heart rate')"
|
|
|
|
]
|
|
|
|
},
|
|
|
|
{
|
|
|
|
"cell_type": "code",
|
2023-04-09 13:23:19 +02:00
|
|
|
"execution_count": null,
|
2023-04-03 13:08:49 +02:00
|
|
|
"metadata": {
|
|
|
|
"scrolled": true
|
|
|
|
},
|
2023-04-09 13:23:19 +02:00
|
|
|
"outputs": [],
|
2023-04-03 13:08:49 +02:00
|
|
|
"source": [
|
|
|
|
"# Plot sex and target in one bar chart via crosstab\n",
|
|
|
|
"pd.crosstab(df.sex,df.target).plot(kind=\"bar\",color=['red','blue' ])\n",
|
|
|
|
"plt.title('Heart Disease distribution according to Sex')\n",
|
|
|
|
"plt.xlabel('Sex (0 = Female, 1 = Male)')\n",
|
|
|
|
"plt.legend([\"no disease\", \"disease\"])"
|
|
|
|
]
|
|
|
|
},
|
|
|
|
{
|
|
|
|
"cell_type": "code",
|
2023-04-09 13:23:19 +02:00
|
|
|
"execution_count": null,
|
2023-04-03 13:08:49 +02:00
|
|
|
"metadata": {
|
|
|
|
"scrolled": true
|
|
|
|
},
|
2023-04-09 13:23:19 +02:00
|
|
|
"outputs": [],
|
2023-04-03 13:08:49 +02:00
|
|
|
"source": [
|
|
|
|
"# Plot target and cp in one bar chart via crosstab\n",
|
|
|
|
"pd.crosstab(df.cp,df.target).plot(kind=\"bar\",figsize=(15,6),color=['#11A5AA','#AA1190' ])\n",
|
|
|
|
"plt.title('Heart Disease Distribution According To Chest Pain Type')\n",
|
|
|
|
"plt.xlabel('Chest Pain Type')\n",
|
|
|
|
"plt.xticks(rotation = 0)\n",
|
|
|
|
"plt.ylabel('Frequency of Disease or Not')\n"
|
|
|
|
]
|
|
|
|
},
|
|
|
|
{
|
|
|
|
"cell_type": "code",
|
2023-04-09 13:23:19 +02:00
|
|
|
"execution_count": null,
|
2023-04-03 13:08:49 +02:00
|
|
|
"metadata": {},
|
2023-04-09 13:23:19 +02:00
|
|
|
"outputs": [],
|
2023-04-03 13:08:49 +02:00
|
|
|
"source": [
|
|
|
|
"# Scatter plot of age vs. thalach, split by target (red = target 1)\n",
|
|
|
|
"plt.figure()\n",
|
|
|
|
"plt.scatter(x=df.age[df.target==1], y=df.thalach[(df.target==1)], c=\"red\")\n",
|
|
|
|
"plt.scatter(x=df.age[df.target==0], y=df.thalach[(df.target==0)])\n",
|
|
|
|
"plt.title('Age-max Heart Rate Plot')\n",
|
|
|
|
"plt.xlabel('age[years]')\n",
|
|
|
|
"plt.ylabel('max. heart rate')\n",
|
|
|
|
"plt.legend([\"Disease\", \"No Disease\"])"
|
|
|
|
]
|
|
|
|
},
|
|
|
|
{
|
|
|
|
"cell_type": "code",
|
2023-04-09 13:23:19 +02:00
|
|
|
"execution_count": null,
|
2023-04-03 13:08:49 +02:00
|
|
|
"metadata": {},
|
2023-04-09 13:23:19 +02:00
|
|
|
"outputs": [],
|
2023-04-03 13:08:49 +02:00
|
|
|
"source": [
|
|
|
|
"plt.figure()\n",
|
|
|
|
"plt.scatter(x=df.age[df.target==1], y=df.chol[(df.target==1)], c=\"red\")\n",
|
|
|
|
"plt.scatter(x=df.age[df.target==0], y=df.chol[(df.target==0)])\n",
|
|
|
|
"plt.title('Age-Cholesterol Plot')\n",
|
|
|
|
"plt.xlabel('age[years]')\n",
|
|
|
|
"plt.ylabel('Cholesterol')\n",
|
|
|
|
"plt.legend([\"Disease\", \"No Disease\"])"
|
|
|
|
]
|
|
|
|
},
|
|
|
|
{
|
|
|
|
"cell_type": "code",
|
|
|
|
"execution_count": null,
|
|
|
|
"metadata": {},
|
|
|
|
"outputs": [],
|
|
|
|
"source": []
|
|
|
|
}
|
|
|
|
],
|
|
|
|
"metadata": {
|
|
|
|
"kernelspec": {
|
|
|
|
"display_name": "Python 3 (ipykernel)",
|
|
|
|
"language": "python",
|
|
|
|
"name": "python3"
|
|
|
|
},
|
|
|
|
"language_info": {
|
|
|
|
"codemirror_mode": {
|
|
|
|
"name": "ipython",
|
|
|
|
"version": 3
|
|
|
|
},
|
|
|
|
"file_extension": ".py",
|
|
|
|
"mimetype": "text/x-python",
|
|
|
|
"name": "python",
|
|
|
|
"nbconvert_exporter": "python",
|
|
|
|
"pygments_lexer": "ipython3",
|
|
|
|
"version": "3.8.16"
|
|
|
|
}
|
|
|
|
},
|
|
|
|
"nbformat": 4,
|
|
|
|
"nbformat_minor": 4
|
|
|
|
}
|