From 5fad7c2bfb2a51cb09756a5458bf6438ae7d48d6 Mon Sep 17 00:00:00 2001 From: Sanjana Bankar <96869212+SanjanaBankar@users.noreply.github.com> Date: Sun, 19 May 2024 22:09:34 +0530 Subject: [PATCH] Add files via upload --- .../Hepatitis C Prediction.ipynb | 1262 +++++++++++++++++ 1 file changed, 1262 insertions(+) create mode 100644 Hepatitis C Prediction/Hepatitis C Prediction.ipynb diff --git a/Hepatitis C Prediction/Hepatitis C Prediction.ipynb b/Hepatitis C Prediction/Hepatitis C Prediction.ipynb new file mode 100644 index 00000000..f1bf9108 --- /dev/null +++ b/Hepatitis C Prediction/Hepatitis C Prediction.ipynb @@ -0,0 +1,1262 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "4e325665", + "metadata": {}, + "source": [ + "# Hepatitis C Prediction | 1. Dataset Exploration\n", + "\n", + "-> Load the dataset\n", + "\n", + "-> Explore and confirm features and label(s) of this dataset\n", + "\n", + "-> Explore size/shape of dataset\n", + "\n", + "-> Investigate data type of features and labels and choose a better data type for a \n", + "particular column where possible\n", + "\n", + "-> Calculate the memory usage differences\n", + "\n", + "-> Explore summary statistics such as the mean, median, and selected percentiles of the columns" + ] + }, + { + "cell_type": "markdown", + "id": "71f10b75", + "metadata": {}, + "source": [ + "## 1. Load the dataset" + ] + }, + { + "cell_type": "code", + "execution_count": 22, + "id": "33aa9c5c", + "metadata": {}, + "outputs": [], + "source": [ + "import pandas as pd\n", + "import numpy as np\n", + "from sklearn.preprocessing import LabelEncoder\n", + "import warnings\n", + "\n", + "warnings.filterwarnings(\"ignore\")\n", + "\n", + "df = pd.read_csv(\"HepatitisCdata.csv\")" + ] + }, + { + "cell_type": "markdown", + "id": "28b85f7c", + "metadata": {}, + "source": [ + "## 2. 
Explore and confirm features and label(s) of this dataset" + ] + }, + { + "cell_type": "code", + "execution_count": 23, + "id": "fe695f49", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
Unnamed: 0CategoryAgeSexALBALPALTASTBILCHECHOLCREAGGTPROT
010=Blood Donor32m38.552.57.722.17.56.933.23106.012.169.0
120=Blood Donor32m38.570.318.024.73.911.174.8074.015.676.5
230=Blood Donor32m46.974.736.252.66.18.845.2086.033.279.3
340=Blood Donor32m43.252.030.622.618.97.334.7480.033.875.7
450=Blood Donor32m39.274.132.624.89.69.154.3276.029.968.7
.............................................
6106113=Cirrhosis62f32.0416.65.9110.350.05.576.3055.7650.968.5
6116123=Cirrhosis64f24.0102.82.944.420.01.543.0263.035.971.3
6126133=Cirrhosis64f29.087.33.599.048.01.663.6366.764.282.0
6136143=Cirrhosis46f33.0NaN39.062.020.03.564.2052.050.071.0
6146153=Cirrhosis59f36.0NaN100.080.012.09.075.3067.034.068.0
\n", + "

615 rows × 14 columns

\n", + "
" + ], + "text/plain": [ + " Unnamed: 0 Category Age Sex ALB ALP ALT AST BIL \\\n", + "0 1 0=Blood Donor 32 m 38.5 52.5 7.7 22.1 7.5 \n", + "1 2 0=Blood Donor 32 m 38.5 70.3 18.0 24.7 3.9 \n", + "2 3 0=Blood Donor 32 m 46.9 74.7 36.2 52.6 6.1 \n", + "3 4 0=Blood Donor 32 m 43.2 52.0 30.6 22.6 18.9 \n", + "4 5 0=Blood Donor 32 m 39.2 74.1 32.6 24.8 9.6 \n", + ".. ... ... ... .. ... ... ... ... ... \n", + "610 611 3=Cirrhosis 62 f 32.0 416.6 5.9 110.3 50.0 \n", + "611 612 3=Cirrhosis 64 f 24.0 102.8 2.9 44.4 20.0 \n", + "612 613 3=Cirrhosis 64 f 29.0 87.3 3.5 99.0 48.0 \n", + "613 614 3=Cirrhosis 46 f 33.0 NaN 39.0 62.0 20.0 \n", + "614 615 3=Cirrhosis 59 f 36.0 NaN 100.0 80.0 12.0 \n", + "\n", + " CHE CHOL CREA GGT PROT \n", + "0 6.93 3.23 106.0 12.1 69.0 \n", + "1 11.17 4.80 74.0 15.6 76.5 \n", + "2 8.84 5.20 86.0 33.2 79.3 \n", + "3 7.33 4.74 80.0 33.8 75.7 \n", + "4 9.15 4.32 76.0 29.9 68.7 \n", + ".. ... ... ... ... ... \n", + "610 5.57 6.30 55.7 650.9 68.5 \n", + "611 1.54 3.02 63.0 35.9 71.3 \n", + "612 1.66 3.63 66.7 64.2 82.0 \n", + "613 3.56 4.20 52.0 50.0 71.0 \n", + "614 9.07 5.30 67.0 34.0 68.0 \n", + "\n", + "[615 rows x 14 columns]" + ] + }, + "execution_count": 23, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df# displaying the dataset" + ] + }, + { + "cell_type": "code", + "execution_count": 24, + "id": "78b734a8", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "Index(['Unnamed: 0', 'Category', 'Age', 'Sex', 'ALB', 'ALP', 'ALT', 'AST',\n", + " 'BIL', 'CHE', 'CHOL', 'CREA', 'GGT', 'PROT'],\n", + " dtype='object')" + ] + }, + "execution_count": 24, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df.columns # displaying column names/features" + ] + }, + { + "cell_type": "code", + "execution_count": 25, + "id": "8aa8f697", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
Unnamed: 0CategoryAgeSexALBALPALTASTBILCHECHOLCREAGGTPROT
010=Blood Donor32m38.552.57.722.17.56.933.23106.012.169.0
120=Blood Donor32m38.570.318.024.73.911.174.8074.015.676.5
230=Blood Donor32m46.974.736.252.66.18.845.2086.033.279.3
340=Blood Donor32m43.252.030.622.618.97.334.7480.033.875.7
450=Blood Donor32m39.274.132.624.89.69.154.3276.029.968.7
\n", + "
" + ], + "text/plain": [ + " Unnamed: 0 Category Age Sex ALB ALP ALT AST BIL CHE \\\n", + "0 1 0=Blood Donor 32 m 38.5 52.5 7.7 22.1 7.5 6.93 \n", + "1 2 0=Blood Donor 32 m 38.5 70.3 18.0 24.7 3.9 11.17 \n", + "2 3 0=Blood Donor 32 m 46.9 74.7 36.2 52.6 6.1 8.84 \n", + "3 4 0=Blood Donor 32 m 43.2 52.0 30.6 22.6 18.9 7.33 \n", + "4 5 0=Blood Donor 32 m 39.2 74.1 32.6 24.8 9.6 9.15 \n", + "\n", + " CHOL CREA GGT PROT \n", + "0 3.23 106.0 12.1 69.0 \n", + "1 4.80 74.0 15.6 76.5 \n", + "2 5.20 86.0 33.2 79.3 \n", + "3 4.74 80.0 33.8 75.7 \n", + "4 4.32 76.0 29.9 68.7 " + ] + }, + "execution_count": 25, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df.head() # displaying first 5 rows" + ] + }, + { + "cell_type": "code", + "execution_count": 26, + "id": "3b02f06f", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
Unnamed: 0CategoryAgeSexALBALPALTASTBILCHECHOLCREAGGTPROT
6106113=Cirrhosis62f32.0416.65.9110.350.05.576.3055.7650.968.5
6116123=Cirrhosis64f24.0102.82.944.420.01.543.0263.035.971.3
6126133=Cirrhosis64f29.087.33.599.048.01.663.6366.764.282.0
6136143=Cirrhosis46f33.0NaN39.062.020.03.564.2052.050.071.0
6146153=Cirrhosis59f36.0NaN100.080.012.09.075.3067.034.068.0
\n", + "
" + ], + "text/plain": [ + " Unnamed: 0 Category Age Sex ALB ALP ALT AST BIL CHE \\\n", + "610 611 3=Cirrhosis 62 f 32.0 416.6 5.9 110.3 50.0 5.57 \n", + "611 612 3=Cirrhosis 64 f 24.0 102.8 2.9 44.4 20.0 1.54 \n", + "612 613 3=Cirrhosis 64 f 29.0 87.3 3.5 99.0 48.0 1.66 \n", + "613 614 3=Cirrhosis 46 f 33.0 NaN 39.0 62.0 20.0 3.56 \n", + "614 615 3=Cirrhosis 59 f 36.0 NaN 100.0 80.0 12.0 9.07 \n", + "\n", + " CHOL CREA GGT PROT \n", + "610 6.30 55.7 650.9 68.5 \n", + "611 3.02 63.0 35.9 71.3 \n", + "612 3.63 66.7 64.2 82.0 \n", + "613 4.20 52.0 50.0 71.0 \n", + "614 5.30 67.0 34.0 68.0 " + ] + }, + "execution_count": 26, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df.tail() # displaying last 5 rows" + ] + }, + { + "cell_type": "markdown", + "id": "db48a2e6", + "metadata": {}, + "source": [ + "## 3. Explore size/shape of dataset" + ] + }, + { + "cell_type": "code", + "execution_count": 27, + "id": "fb2aeab3", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "(615, 14)" + ] + }, + "execution_count": 27, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df.shape # displaying shape of the dataset" + ] + }, + { + "cell_type": "code", + "execution_count": 28, + "id": "7f74cf89", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "8610" + ] + }, + "execution_count": 28, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df.size " + ] + }, + { + "cell_type": "markdown", + "id": "d698ae09", + "metadata": {}, + "source": [ + "## 4. 
Investigate data type of features and labels and choose a better data type for a particular column where possible" + ] + }, + { + "cell_type": "code", + "execution_count": 29, + "id": "efe043b8", + "metadata": { + "scrolled": false + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + "RangeIndex: 615 entries, 0 to 614\n", + "Data columns (total 14 columns):\n", + " # Column Non-Null Count Dtype \n", + "--- ------ -------------- ----- \n", + " 0 Unnamed: 0 615 non-null int64 \n", + " 1 Category 615 non-null object \n", + " 2 Age 615 non-null int64 \n", + " 3 Sex 615 non-null object \n", + " 4 ALB 614 non-null float64\n", + " 5 ALP 597 non-null float64\n", + " 6 ALT 614 non-null float64\n", + " 7 AST 615 non-null float64\n", + " 8 BIL 615 non-null float64\n", + " 9 CHE 615 non-null float64\n", + " 10 CHOL 605 non-null float64\n", + " 11 CREA 615 non-null float64\n", + " 12 GGT 615 non-null float64\n", + " 13 PROT 614 non-null float64\n", + "dtypes: float64(10), int64(2), object(2)\n", + "memory usage: 67.4+ KB\n" + ] + } + ], + "source": [ + "df.info() # displays information about the dataset, with respective datatypes for each column" + ] + }, + { + "cell_type": "markdown", + "id": "4f0e90b1", + "metadata": {}, + "source": [ + "Here we identify the columns that need encoding and the columns that need missing-value handling:\n", + "\n", + "-> Category: This column represents categorical data. Encoding is necessary. -> Using One-Hot Encoding.\n", + "\n", + "-> Sex: This column represents binary categorical data. 
-> Using Binary Encoding via Label Encoding.\n", + "\n", + "-> Columns that Need Handling for Missing Values: ALB, ALP, ALT, CHOL, PROT" + ] + }, + { + "cell_type": "code", + "execution_count": 30, + "id": "12e995d4", + "metadata": {}, + "outputs": [], + "source": [ + "if 'Unnamed: 0' in df.columns:\n", + " df = df.drop(columns=['Unnamed: 0'])" + ] + }, + { + "cell_type": "code", + "execution_count": 31, + "id": "a928887b", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "DataFrame info after dropping Unnamed: 0:\n", + "\n", + "RangeIndex: 615 entries, 0 to 614\n", + "Data columns (total 13 columns):\n", + " # Column Non-Null Count Dtype \n", + "--- ------ -------------- ----- \n", + " 0 Category 615 non-null object \n", + " 1 Age 615 non-null int64 \n", + " 2 Sex 615 non-null object \n", + " 3 ALB 614 non-null float64\n", + " 4 ALP 597 non-null float64\n", + " 5 ALT 614 non-null float64\n", + " 6 AST 615 non-null float64\n", + " 7 BIL 615 non-null float64\n", + " 8 CHE 615 non-null float64\n", + " 9 CHOL 605 non-null float64\n", + " 10 CREA 615 non-null float64\n", + " 11 GGT 615 non-null float64\n", + " 12 PROT 614 non-null float64\n", + "dtypes: float64(10), int64(1), object(2)\n", + "memory usage: 62.6+ KB\n" + ] + } + ], + "source": [ + "# Print info to confirm the drop\n", + "print(\"DataFrame info after dropping Unnamed: 0:\")\n", + "df.info()" + ] + }, + { + "cell_type": "code", + "execution_count": 32, + "id": "4b27ed99", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Null values before handling:\n", + "Category 0\n", + "Age 0\n", + "Sex 0\n", + "ALB 1\n", + "ALP 18\n", + "ALT 1\n", + "AST 0\n", + "BIL 0\n", + "CHE 0\n", + "CHOL 10\n", + "CREA 0\n", + "GGT 0\n", + "PROT 1\n", + "dtype: int64\n" + ] + } + ], + "source": [ + "null_values_before = df.isnull().sum() # Checking for null values before handling\n", + "print(\"Null values before handling:\")\n", + 
"print(null_values_before)" + ] + }, + { + "cell_type": "code", + "execution_count": 33, + "id": "7dd5064b", + "metadata": {}, + "outputs": [], + "source": [ + "df = pd.get_dummies(df, columns=['Category'], prefix='Category') # Encode 'Category' using One-Hot Encoding and 'Sex' using Label Encoding\n", + "df['Sex'] = LabelEncoder().fit_transform(df['Sex'])" + ] + }, + { + "cell_type": "code", + "execution_count": 34, + "id": "8917eee9", + "metadata": {}, + "outputs": [], + "source": [ + "columns_with_missing_values = ['ALB', 'ALP', 'ALT', 'CHOL', 'PROT'] # Handling missing values: Impute with the mean\n", + "for column in columns_with_missing_values:\n", + " df[column].fillna(df[column].mean(), inplace=True)" + ] + }, + { + "cell_type": "code", + "execution_count": 35, + "id": "ba38a86a", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Null values after handling:\n", + "Age 0\n", + "Sex 0\n", + "ALB 0\n", + "ALP 0\n", + "ALT 0\n", + "AST 0\n", + "BIL 0\n", + "CHE 0\n", + "CHOL 0\n", + "CREA 0\n", + "GGT 0\n", + "PROT 0\n", + "Category_0=Blood Donor 0\n", + "Category_0s=suspect Blood Donor 0\n", + "Category_1=Hepatitis 0\n", + "Category_2=Fibrosis 0\n", + "Category_3=Cirrhosis 0\n", + "dtype: int64\n" + ] + } + ], + "source": [ + "null_values_after = df.isnull().sum() # Checking for null values after handling\n", + "print(\"Null values after handling:\")\n", + "print(null_values_after)" + ] + }, + { + "cell_type": "markdown", + "id": "fa6c8dd9", + "metadata": {}, + "source": [ + "## 5. 
Calculate the memory usage differences" + ] + }, + { + "cell_type": "code", + "execution_count": 36, + "id": "aee464f4", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Memory usage before processing: 0.06 MB\n", + "\n", + "RangeIndex: 615 entries, 0 to 614\n", + "Data columns (total 17 columns):\n", + " # Column Non-Null Count Dtype \n", + "--- ------ -------------- ----- \n", + " 0 Age 615 non-null int64 \n", + " 1 Sex 615 non-null int32 \n", + " 2 ALB 615 non-null float64\n", + " 3 ALP 615 non-null float64\n", + " 4 ALT 615 non-null float64\n", + " 5 AST 615 non-null float64\n", + " 6 BIL 615 non-null float64\n", + " 7 CHE 615 non-null float64\n", + " 8 CHOL 615 non-null float64\n", + " 9 CREA 615 non-null float64\n", + " 10 GGT 615 non-null float64\n", + " 11 PROT 615 non-null float64\n", + " 12 Category_0=Blood Donor 615 non-null bool \n", + " 13 Category_0s=suspect Blood Donor 615 non-null bool \n", + " 14 Category_1=Hepatitis 615 non-null bool \n", + " 15 Category_2=Fibrosis 615 non-null bool \n", + " 16 Category_3=Cirrhosis 615 non-null bool \n", + "dtypes: bool(5), float64(10), int32(1), int64(1)\n", + "memory usage: 58.4 KB\n" + ] + } + ], + "source": [ + "memory_usage_before = df.memory_usage(deep=True).sum()\n", + "print(f\"Memory usage before processing: {memory_usage_before / 1024 ** 2:.2f} MB\")\n", + "df.info()" + ] + }, + { + "cell_type": "code", + "execution_count": 37, + "id": "6b6aa90d", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Memory usage after processing: 0.06 MB\n" + ] + } + ], + "source": [ + "memory_usage_after = df.memory_usage(deep=True).sum()\n", + "print(f\"Memory usage after processing: {memory_usage_after / 1024 ** 2:.2f} MB\")" + ] + }, + { + "cell_type": "markdown", + "id": "9d0f29a3", + "metadata": {}, + "source": [ + "## 6. 
Explore summary statistics such as the mean, median, and selected percentiles of the columns" + ] + }, + { + "cell_type": "code", + "execution_count": 38, + "id": "59fd3aad", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
AgeSexALBALPALTASTBILCHECHOLCREAGGTPROT
count615.000000615.000000615.000000615.000000615.000000615.000000615.000000615.000000615.000000615.000000615.000000615.000000
mean47.4081300.61300841.62019568.28392028.45081434.78634111.3967488.1966345.36809981.28780539.53317172.044137
std10.0551050.4874585.77592025.64395525.44894033.09069019.6731502.2056571.12346649.75616654.6610715.398234
min19.0000000.00000014.90000011.3000000.90000010.6000000.8000001.4200001.4300008.0000004.50000044.800000
10%34.0000000.00000035.64000042.74000011.90000018.1800003.7000005.6140004.03800061.00000011.90000066.300000
25%39.0000000.00000038.80000052.95000016.40000021.6000005.3000006.9350004.62000067.00000015.70000069.300000
50%47.0000001.00000041.90000066.70000023.00000025.9000007.3000008.2600005.31000077.00000023.30000072.200000
75%54.0000001.00000045.20000079.30000033.05000032.90000011.2000009.5900006.05500088.00000040.20000075.400000
99%71.0000001.00000053.258000137.158000118.086000187.32200087.64000013.8516008.580400134.826000292.53400082.686000
max77.0000001.00000082.200000416.600000325.300000324.000000254.00000016.4100009.6700001079.100000650.90000090.000000
\n", + "
" + ], + "text/plain": [ + " Age Sex ALB ALP ALT AST \\\n", + "count 615.000000 615.000000 615.000000 615.000000 615.000000 615.000000 \n", + "mean 47.408130 0.613008 41.620195 68.283920 28.450814 34.786341 \n", + "std 10.055105 0.487458 5.775920 25.643955 25.448940 33.090690 \n", + "min 19.000000 0.000000 14.900000 11.300000 0.900000 10.600000 \n", + "10% 34.000000 0.000000 35.640000 42.740000 11.900000 18.180000 \n", + "25% 39.000000 0.000000 38.800000 52.950000 16.400000 21.600000 \n", + "50% 47.000000 1.000000 41.900000 66.700000 23.000000 25.900000 \n", + "75% 54.000000 1.000000 45.200000 79.300000 33.050000 32.900000 \n", + "99% 71.000000 1.000000 53.258000 137.158000 118.086000 187.322000 \n", + "max 77.000000 1.000000 82.200000 416.600000 325.300000 324.000000 \n", + "\n", + " BIL CHE CHOL CREA GGT PROT \n", + "count 615.000000 615.000000 615.000000 615.000000 615.000000 615.000000 \n", + "mean 11.396748 8.196634 5.368099 81.287805 39.533171 72.044137 \n", + "std 19.673150 2.205657 1.123466 49.756166 54.661071 5.398234 \n", + "min 0.800000 1.420000 1.430000 8.000000 4.500000 44.800000 \n", + "10% 3.700000 5.614000 4.038000 61.000000 11.900000 66.300000 \n", + "25% 5.300000 6.935000 4.620000 67.000000 15.700000 69.300000 \n", + "50% 7.300000 8.260000 5.310000 77.000000 23.300000 72.200000 \n", + "75% 11.200000 9.590000 6.055000 88.000000 40.200000 75.400000 \n", + "99% 87.640000 13.851600 8.580400 134.826000 292.534000 82.686000 \n", + "max 254.000000 16.410000 9.670000 1079.100000 650.900000 90.000000 " + ] + }, + "execution_count": 38, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df.describe(percentiles=[0.1,0.25,0.5,0.75,0.99]) # describes numerical interpretations of the dataset in terms of mean, max, min, quartiles, etc.\n", + "# percentiles considered: 10, 25, 50, 75, 99" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + 
"language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.11.5" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +}