From 5fad7c2bfb2a51cb09756a5458bf6438ae7d48d6 Mon Sep 17 00:00:00 2001
From: Sanjana Bankar <96869212+SanjanaBankar@users.noreply.github.com>
Date: Sun, 19 May 2024 22:09:34 +0530
Subject: [PATCH] Add files via upload
---
.../Hepatitis C Prediction.ipynb | 1262 +++++++++++++++++
1 file changed, 1262 insertions(+)
create mode 100644 Hepatitis C Prediction/Hepatitis C Prediction.ipynb
diff --git a/Hepatitis C Prediction/Hepatitis C Prediction.ipynb b/Hepatitis C Prediction/Hepatitis C Prediction.ipynb
new file mode 100644
index 00000000..f1bf9108
--- /dev/null
+++ b/Hepatitis C Prediction/Hepatitis C Prediction.ipynb
@@ -0,0 +1,1262 @@
+{
+ "cells": [
+ {
+ "cell_type": "markdown",
+ "id": "4e325665",
+ "metadata": {},
+ "source": [
+ "# HepatitisC Prediction | 1. Dataset Exploration\n",
+ "\n",
+ "-> Load the dataset\n",
+ "\n",
+ "-> Explore and confirm features and label(s) of this dataset\n",
+ "\n",
+ "-> Explore size/shape of dataset\n",
+ "\n",
+ "-> Investigate the data types of the features and labels, and choose a better data type for any \n",
+ "column where possible\n",
+ "\n",
+ "-> Calculate the memory usage differences\n",
+ "\n",
+ "-> Explore the statistical facts like mean, median, x percentiles of the columns"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "71f10b75",
+ "metadata": {},
+ "source": [
+ "## 1. Load the dataset"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 22,
+ "id": "33aa9c5c",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "import pandas as pd\n",
+ "import numpy as np\n",
+ "from sklearn.preprocessing import LabelEncoder\n",
+ "import warnings\n",
+ "\n",
+ "# Silence every warning category for the remainder of the session.\n",
+ "warnings.filterwarnings(action=\"ignore\")\n",
+ "\n",
+ "# Load the raw dataset; the relative path is resolved against the current working directory.\n",
+ "df = pd.read_csv(\"HepatitisCdata.csv\")"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "28b85f7c",
+ "metadata": {},
+ "source": [
+ "## 2. Explore and confirm features and label(s) of this dataset"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 23,
+ "id": "fe695f49",
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "
\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " Unnamed: 0 | \n",
+ " Category | \n",
+ " Age | \n",
+ " Sex | \n",
+ " ALB | \n",
+ " ALP | \n",
+ " ALT | \n",
+ " AST | \n",
+ " BIL | \n",
+ " CHE | \n",
+ " CHOL | \n",
+ " CREA | \n",
+ " GGT | \n",
+ " PROT | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " 0 | \n",
+ " 1 | \n",
+ " 0=Blood Donor | \n",
+ " 32 | \n",
+ " m | \n",
+ " 38.5 | \n",
+ " 52.5 | \n",
+ " 7.7 | \n",
+ " 22.1 | \n",
+ " 7.5 | \n",
+ " 6.93 | \n",
+ " 3.23 | \n",
+ " 106.0 | \n",
+ " 12.1 | \n",
+ " 69.0 | \n",
+ "
\n",
+ " \n",
+ " 1 | \n",
+ " 2 | \n",
+ " 0=Blood Donor | \n",
+ " 32 | \n",
+ " m | \n",
+ " 38.5 | \n",
+ " 70.3 | \n",
+ " 18.0 | \n",
+ " 24.7 | \n",
+ " 3.9 | \n",
+ " 11.17 | \n",
+ " 4.80 | \n",
+ " 74.0 | \n",
+ " 15.6 | \n",
+ " 76.5 | \n",
+ "
\n",
+ " \n",
+ " 2 | \n",
+ " 3 | \n",
+ " 0=Blood Donor | \n",
+ " 32 | \n",
+ " m | \n",
+ " 46.9 | \n",
+ " 74.7 | \n",
+ " 36.2 | \n",
+ " 52.6 | \n",
+ " 6.1 | \n",
+ " 8.84 | \n",
+ " 5.20 | \n",
+ " 86.0 | \n",
+ " 33.2 | \n",
+ " 79.3 | \n",
+ "
\n",
+ " \n",
+ " 3 | \n",
+ " 4 | \n",
+ " 0=Blood Donor | \n",
+ " 32 | \n",
+ " m | \n",
+ " 43.2 | \n",
+ " 52.0 | \n",
+ " 30.6 | \n",
+ " 22.6 | \n",
+ " 18.9 | \n",
+ " 7.33 | \n",
+ " 4.74 | \n",
+ " 80.0 | \n",
+ " 33.8 | \n",
+ " 75.7 | \n",
+ "
\n",
+ " \n",
+ " 4 | \n",
+ " 5 | \n",
+ " 0=Blood Donor | \n",
+ " 32 | \n",
+ " m | \n",
+ " 39.2 | \n",
+ " 74.1 | \n",
+ " 32.6 | \n",
+ " 24.8 | \n",
+ " 9.6 | \n",
+ " 9.15 | \n",
+ " 4.32 | \n",
+ " 76.0 | \n",
+ " 29.9 | \n",
+ " 68.7 | \n",
+ "
\n",
+ " \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ "
\n",
+ " \n",
+ " 610 | \n",
+ " 611 | \n",
+ " 3=Cirrhosis | \n",
+ " 62 | \n",
+ " f | \n",
+ " 32.0 | \n",
+ " 416.6 | \n",
+ " 5.9 | \n",
+ " 110.3 | \n",
+ " 50.0 | \n",
+ " 5.57 | \n",
+ " 6.30 | \n",
+ " 55.7 | \n",
+ " 650.9 | \n",
+ " 68.5 | \n",
+ "
\n",
+ " \n",
+ " 611 | \n",
+ " 612 | \n",
+ " 3=Cirrhosis | \n",
+ " 64 | \n",
+ " f | \n",
+ " 24.0 | \n",
+ " 102.8 | \n",
+ " 2.9 | \n",
+ " 44.4 | \n",
+ " 20.0 | \n",
+ " 1.54 | \n",
+ " 3.02 | \n",
+ " 63.0 | \n",
+ " 35.9 | \n",
+ " 71.3 | \n",
+ "
\n",
+ " \n",
+ " 612 | \n",
+ " 613 | \n",
+ " 3=Cirrhosis | \n",
+ " 64 | \n",
+ " f | \n",
+ " 29.0 | \n",
+ " 87.3 | \n",
+ " 3.5 | \n",
+ " 99.0 | \n",
+ " 48.0 | \n",
+ " 1.66 | \n",
+ " 3.63 | \n",
+ " 66.7 | \n",
+ " 64.2 | \n",
+ " 82.0 | \n",
+ "
\n",
+ " \n",
+ " 613 | \n",
+ " 614 | \n",
+ " 3=Cirrhosis | \n",
+ " 46 | \n",
+ " f | \n",
+ " 33.0 | \n",
+ " NaN | \n",
+ " 39.0 | \n",
+ " 62.0 | \n",
+ " 20.0 | \n",
+ " 3.56 | \n",
+ " 4.20 | \n",
+ " 52.0 | \n",
+ " 50.0 | \n",
+ " 71.0 | \n",
+ "
\n",
+ " \n",
+ " 614 | \n",
+ " 615 | \n",
+ " 3=Cirrhosis | \n",
+ " 59 | \n",
+ " f | \n",
+ " 36.0 | \n",
+ " NaN | \n",
+ " 100.0 | \n",
+ " 80.0 | \n",
+ " 12.0 | \n",
+ " 9.07 | \n",
+ " 5.30 | \n",
+ " 67.0 | \n",
+ " 34.0 | \n",
+ " 68.0 | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
615 rows × 14 columns
\n",
+ "
"
+ ],
+ "text/plain": [
+ " Unnamed: 0 Category Age Sex ALB ALP ALT AST BIL \\\n",
+ "0 1 0=Blood Donor 32 m 38.5 52.5 7.7 22.1 7.5 \n",
+ "1 2 0=Blood Donor 32 m 38.5 70.3 18.0 24.7 3.9 \n",
+ "2 3 0=Blood Donor 32 m 46.9 74.7 36.2 52.6 6.1 \n",
+ "3 4 0=Blood Donor 32 m 43.2 52.0 30.6 22.6 18.9 \n",
+ "4 5 0=Blood Donor 32 m 39.2 74.1 32.6 24.8 9.6 \n",
+ ".. ... ... ... .. ... ... ... ... ... \n",
+ "610 611 3=Cirrhosis 62 f 32.0 416.6 5.9 110.3 50.0 \n",
+ "611 612 3=Cirrhosis 64 f 24.0 102.8 2.9 44.4 20.0 \n",
+ "612 613 3=Cirrhosis 64 f 29.0 87.3 3.5 99.0 48.0 \n",
+ "613 614 3=Cirrhosis 46 f 33.0 NaN 39.0 62.0 20.0 \n",
+ "614 615 3=Cirrhosis 59 f 36.0 NaN 100.0 80.0 12.0 \n",
+ "\n",
+ " CHE CHOL CREA GGT PROT \n",
+ "0 6.93 3.23 106.0 12.1 69.0 \n",
+ "1 11.17 4.80 74.0 15.6 76.5 \n",
+ "2 8.84 5.20 86.0 33.2 79.3 \n",
+ "3 7.33 4.74 80.0 33.8 75.7 \n",
+ "4 9.15 4.32 76.0 29.9 68.7 \n",
+ ".. ... ... ... ... ... \n",
+ "610 5.57 6.30 55.7 650.9 68.5 \n",
+ "611 1.54 3.02 63.0 35.9 71.3 \n",
+ "612 1.66 3.63 66.7 64.2 82.0 \n",
+ "613 3.56 4.20 52.0 50.0 71.0 \n",
+ "614 9.07 5.30 67.0 34.0 68.0 \n",
+ "\n",
+ "[615 rows x 14 columns]"
+ ]
+ },
+ "execution_count": 23,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "df  # render the dataset (long frames are abbreviated in the displayed output)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 24,
+ "id": "78b734a8",
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "Index(['Unnamed: 0', 'Category', 'Age', 'Sex', 'ALB', 'ALP', 'ALT', 'AST',\n",
+ " 'BIL', 'CHE', 'CHOL', 'CREA', 'GGT', 'PROT'],\n",
+ " dtype='object')"
+ ]
+ },
+ "execution_count": 24,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "df.columns  # column labels of the dataset (features and label)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 25,
+ "id": "8aa8f697",
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " Unnamed: 0 | \n",
+ " Category | \n",
+ " Age | \n",
+ " Sex | \n",
+ " ALB | \n",
+ " ALP | \n",
+ " ALT | \n",
+ " AST | \n",
+ " BIL | \n",
+ " CHE | \n",
+ " CHOL | \n",
+ " CREA | \n",
+ " GGT | \n",
+ " PROT | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " 0 | \n",
+ " 1 | \n",
+ " 0=Blood Donor | \n",
+ " 32 | \n",
+ " m | \n",
+ " 38.5 | \n",
+ " 52.5 | \n",
+ " 7.7 | \n",
+ " 22.1 | \n",
+ " 7.5 | \n",
+ " 6.93 | \n",
+ " 3.23 | \n",
+ " 106.0 | \n",
+ " 12.1 | \n",
+ " 69.0 | \n",
+ "
\n",
+ " \n",
+ " 1 | \n",
+ " 2 | \n",
+ " 0=Blood Donor | \n",
+ " 32 | \n",
+ " m | \n",
+ " 38.5 | \n",
+ " 70.3 | \n",
+ " 18.0 | \n",
+ " 24.7 | \n",
+ " 3.9 | \n",
+ " 11.17 | \n",
+ " 4.80 | \n",
+ " 74.0 | \n",
+ " 15.6 | \n",
+ " 76.5 | \n",
+ "
\n",
+ " \n",
+ " 2 | \n",
+ " 3 | \n",
+ " 0=Blood Donor | \n",
+ " 32 | \n",
+ " m | \n",
+ " 46.9 | \n",
+ " 74.7 | \n",
+ " 36.2 | \n",
+ " 52.6 | \n",
+ " 6.1 | \n",
+ " 8.84 | \n",
+ " 5.20 | \n",
+ " 86.0 | \n",
+ " 33.2 | \n",
+ " 79.3 | \n",
+ "
\n",
+ " \n",
+ " 3 | \n",
+ " 4 | \n",
+ " 0=Blood Donor | \n",
+ " 32 | \n",
+ " m | \n",
+ " 43.2 | \n",
+ " 52.0 | \n",
+ " 30.6 | \n",
+ " 22.6 | \n",
+ " 18.9 | \n",
+ " 7.33 | \n",
+ " 4.74 | \n",
+ " 80.0 | \n",
+ " 33.8 | \n",
+ " 75.7 | \n",
+ "
\n",
+ " \n",
+ " 4 | \n",
+ " 5 | \n",
+ " 0=Blood Donor | \n",
+ " 32 | \n",
+ " m | \n",
+ " 39.2 | \n",
+ " 74.1 | \n",
+ " 32.6 | \n",
+ " 24.8 | \n",
+ " 9.6 | \n",
+ " 9.15 | \n",
+ " 4.32 | \n",
+ " 76.0 | \n",
+ " 29.9 | \n",
+ " 68.7 | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " Unnamed: 0 Category Age Sex ALB ALP ALT AST BIL CHE \\\n",
+ "0 1 0=Blood Donor 32 m 38.5 52.5 7.7 22.1 7.5 6.93 \n",
+ "1 2 0=Blood Donor 32 m 38.5 70.3 18.0 24.7 3.9 11.17 \n",
+ "2 3 0=Blood Donor 32 m 46.9 74.7 36.2 52.6 6.1 8.84 \n",
+ "3 4 0=Blood Donor 32 m 43.2 52.0 30.6 22.6 18.9 7.33 \n",
+ "4 5 0=Blood Donor 32 m 39.2 74.1 32.6 24.8 9.6 9.15 \n",
+ "\n",
+ " CHOL CREA GGT PROT \n",
+ "0 3.23 106.0 12.1 69.0 \n",
+ "1 4.80 74.0 15.6 76.5 \n",
+ "2 5.20 86.0 33.2 79.3 \n",
+ "3 4.74 80.0 33.8 75.7 \n",
+ "4 4.32 76.0 29.9 68.7 "
+ ]
+ },
+ "execution_count": 25,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "df.head(n=5)  # first five rows"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 26,
+ "id": "3b02f06f",
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " Unnamed: 0 | \n",
+ " Category | \n",
+ " Age | \n",
+ " Sex | \n",
+ " ALB | \n",
+ " ALP | \n",
+ " ALT | \n",
+ " AST | \n",
+ " BIL | \n",
+ " CHE | \n",
+ " CHOL | \n",
+ " CREA | \n",
+ " GGT | \n",
+ " PROT | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " 610 | \n",
+ " 611 | \n",
+ " 3=Cirrhosis | \n",
+ " 62 | \n",
+ " f | \n",
+ " 32.0 | \n",
+ " 416.6 | \n",
+ " 5.9 | \n",
+ " 110.3 | \n",
+ " 50.0 | \n",
+ " 5.57 | \n",
+ " 6.30 | \n",
+ " 55.7 | \n",
+ " 650.9 | \n",
+ " 68.5 | \n",
+ "
\n",
+ " \n",
+ " 611 | \n",
+ " 612 | \n",
+ " 3=Cirrhosis | \n",
+ " 64 | \n",
+ " f | \n",
+ " 24.0 | \n",
+ " 102.8 | \n",
+ " 2.9 | \n",
+ " 44.4 | \n",
+ " 20.0 | \n",
+ " 1.54 | \n",
+ " 3.02 | \n",
+ " 63.0 | \n",
+ " 35.9 | \n",
+ " 71.3 | \n",
+ "
\n",
+ " \n",
+ " 612 | \n",
+ " 613 | \n",
+ " 3=Cirrhosis | \n",
+ " 64 | \n",
+ " f | \n",
+ " 29.0 | \n",
+ " 87.3 | \n",
+ " 3.5 | \n",
+ " 99.0 | \n",
+ " 48.0 | \n",
+ " 1.66 | \n",
+ " 3.63 | \n",
+ " 66.7 | \n",
+ " 64.2 | \n",
+ " 82.0 | \n",
+ "
\n",
+ " \n",
+ " 613 | \n",
+ " 614 | \n",
+ " 3=Cirrhosis | \n",
+ " 46 | \n",
+ " f | \n",
+ " 33.0 | \n",
+ " NaN | \n",
+ " 39.0 | \n",
+ " 62.0 | \n",
+ " 20.0 | \n",
+ " 3.56 | \n",
+ " 4.20 | \n",
+ " 52.0 | \n",
+ " 50.0 | \n",
+ " 71.0 | \n",
+ "
\n",
+ " \n",
+ " 614 | \n",
+ " 615 | \n",
+ " 3=Cirrhosis | \n",
+ " 59 | \n",
+ " f | \n",
+ " 36.0 | \n",
+ " NaN | \n",
+ " 100.0 | \n",
+ " 80.0 | \n",
+ " 12.0 | \n",
+ " 9.07 | \n",
+ " 5.30 | \n",
+ " 67.0 | \n",
+ " 34.0 | \n",
+ " 68.0 | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " Unnamed: 0 Category Age Sex ALB ALP ALT AST BIL CHE \\\n",
+ "610 611 3=Cirrhosis 62 f 32.0 416.6 5.9 110.3 50.0 5.57 \n",
+ "611 612 3=Cirrhosis 64 f 24.0 102.8 2.9 44.4 20.0 1.54 \n",
+ "612 613 3=Cirrhosis 64 f 29.0 87.3 3.5 99.0 48.0 1.66 \n",
+ "613 614 3=Cirrhosis 46 f 33.0 NaN 39.0 62.0 20.0 3.56 \n",
+ "614 615 3=Cirrhosis 59 f 36.0 NaN 100.0 80.0 12.0 9.07 \n",
+ "\n",
+ " CHOL CREA GGT PROT \n",
+ "610 6.30 55.7 650.9 68.5 \n",
+ "611 3.02 63.0 35.9 71.3 \n",
+ "612 3.63 66.7 64.2 82.0 \n",
+ "613 4.20 52.0 50.0 71.0 \n",
+ "614 5.30 67.0 34.0 68.0 "
+ ]
+ },
+ "execution_count": 26,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "df.tail(n=5)  # last five rows"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "db48a2e6",
+ "metadata": {},
+ "source": [
+ "## 3. Explore size/shape of dataset"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 27,
+ "id": "fb2aeab3",
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "(615, 14)"
+ ]
+ },
+ "execution_count": 27,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "df.shape  # (number of rows, number of columns)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 28,
+ "id": "7f74cf89",
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "8610"
+ ]
+ },
+ "execution_count": 28,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "df.size  # total number of cells (rows x columns)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "d698ae09",
+ "metadata": {},
+ "source": [
+ "## 4. Investigate the data types of the features and labels, and choose a better data type for any column where possible"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 29,
+ "id": "efe043b8",
+ "metadata": {
+ "scrolled": false
+ },
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "\n",
+ "RangeIndex: 615 entries, 0 to 614\n",
+ "Data columns (total 14 columns):\n",
+ " # Column Non-Null Count Dtype \n",
+ "--- ------ -------------- ----- \n",
+ " 0 Unnamed: 0 615 non-null int64 \n",
+ " 1 Category 615 non-null object \n",
+ " 2 Age 615 non-null int64 \n",
+ " 3 Sex 615 non-null object \n",
+ " 4 ALB 614 non-null float64\n",
+ " 5 ALP 597 non-null float64\n",
+ " 6 ALT 614 non-null float64\n",
+ " 7 AST 615 non-null float64\n",
+ " 8 BIL 615 non-null float64\n",
+ " 9 CHE 615 non-null float64\n",
+ " 10 CHOL 605 non-null float64\n",
+ " 11 CREA 615 non-null float64\n",
+ " 12 GGT 615 non-null float64\n",
+ " 13 PROT 614 non-null float64\n",
+ "dtypes: float64(10), int64(2), object(2)\n",
+ "memory usage: 67.4+ KB\n"
+ ]
+ }
+ ],
+ "source": [
+ "df.info()  # per-column non-null counts and dtypes, plus the frame's memory usage"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "4f0e90b1",
+ "metadata": {},
+ "source": [
+ "The summary above shows which columns need encoding and which need missing-value handling:\n",
+ "\n",
+ "-> Category: categorical data, so it must be encoded; one-hot encoding is used.\n",
+ "\n",
+ "-> Sex: binary categorical data; it is encoded as integers with label encoding.\n",
+ "\n",
+ "-> Columns with missing values that need handling: ALB, ALP, ALT, CHOL, PROT"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 30,
+ "id": "12e995d4",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Drop the redundant row-counter column; errors='ignore' makes this a no-op when the column is absent,\n",
+ "# so the cell can safely be re-run.\n",
+ "df = df.drop(columns=['Unnamed: 0'], errors='ignore')"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 31,
+ "id": "a928887b",
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "DataFrame info after dropping Unnamed: 0:\n",
+ "\n",
+ "RangeIndex: 615 entries, 0 to 614\n",
+ "Data columns (total 13 columns):\n",
+ " # Column Non-Null Count Dtype \n",
+ "--- ------ -------------- ----- \n",
+ " 0 Category 615 non-null object \n",
+ " 1 Age 615 non-null int64 \n",
+ " 2 Sex 615 non-null object \n",
+ " 3 ALB 614 non-null float64\n",
+ " 4 ALP 597 non-null float64\n",
+ " 5 ALT 614 non-null float64\n",
+ " 6 AST 615 non-null float64\n",
+ " 7 BIL 615 non-null float64\n",
+ " 8 CHE 615 non-null float64\n",
+ " 9 CHOL 605 non-null float64\n",
+ " 10 CREA 615 non-null float64\n",
+ " 11 GGT 615 non-null float64\n",
+ " 12 PROT 614 non-null float64\n",
+ "dtypes: float64(10), int64(1), object(2)\n",
+ "memory usage: 62.6+ KB\n"
+ ]
+ }
+ ],
+ "source": [
+ "# Re-inspect the frame to verify that the 'Unnamed: 0' column is gone.\n",
+ "print(\"DataFrame info after dropping Unnamed: 0:\")\n",
+ "df.info()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 32,
+ "id": "4b27ed99",
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "Null values before handling:\n",
+ "Category 0\n",
+ "Age 0\n",
+ "Sex 0\n",
+ "ALB 1\n",
+ "ALP 18\n",
+ "ALT 1\n",
+ "AST 0\n",
+ "BIL 0\n",
+ "CHE 0\n",
+ "CHOL 10\n",
+ "CREA 0\n",
+ "GGT 0\n",
+ "PROT 1\n",
+ "dtype: int64\n"
+ ]
+ }
+ ],
+ "source": [
+ "# Count the missing entries in each column before any imputation is applied.\n",
+ "null_values_before = df.isna().sum()\n",
+ "print(\"Null values before handling:\", null_values_before, sep=\"\\n\")"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 33,
+ "id": "7dd5064b",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# One-hot encode 'Category' into one 'Category_<value>' indicator column per class.\n",
+ "df = pd.get_dummies(df, columns=['Category'], prefix='Category')\n",
+ "\n",
+ "# Label-encode the binary 'Sex' column into integer codes.\n",
+ "df['Sex'] = LabelEncoder().fit(df['Sex']).transform(df['Sex'])"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 34,
+ "id": "8917eee9",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Impute missing values with the mean of each affected column.\n",
+ "# The result is assigned back to the frame rather than calling fillna(..., inplace=True) on df[column]:\n",
+ "# that chained in-place form is deprecated in pandas 2.x and does not update df under Copy-on-Write.\n",
+ "columns_with_missing_values = ['ALB', 'ALP', 'ALT', 'CHOL', 'PROT']\n",
+ "for column in columns_with_missing_values:\n",
+ "    df[column] = df[column].fillna(df[column].mean())"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 35,
+ "id": "ba38a86a",
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "Null values after handling:\n",
+ "Age 0\n",
+ "Sex 0\n",
+ "ALB 0\n",
+ "ALP 0\n",
+ "ALT 0\n",
+ "AST 0\n",
+ "BIL 0\n",
+ "CHE 0\n",
+ "CHOL 0\n",
+ "CREA 0\n",
+ "GGT 0\n",
+ "PROT 0\n",
+ "Category_0=Blood Donor 0\n",
+ "Category_0s=suspect Blood Donor 0\n",
+ "Category_1=Hepatitis 0\n",
+ "Category_2=Fibrosis 0\n",
+ "Category_3=Cirrhosis 0\n",
+ "dtype: int64\n"
+ ]
+ }
+ ],
+ "source": [
+ "# Re-count the missing entries to confirm that the imputation left none behind.\n",
+ "null_values_after = df.isna().sum()\n",
+ "print(\"Null values after handling:\", null_values_after, sep=\"\\n\")"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "fa6c8dd9",
+ "metadata": {},
+ "source": [
+ "## 5. Calculate the memory usage differences"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 36,
+ "id": "aee464f4",
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "Memory usage before processing: 0.06 MB\n",
+ "\n",
+ "RangeIndex: 615 entries, 0 to 614\n",
+ "Data columns (total 17 columns):\n",
+ " # Column Non-Null Count Dtype \n",
+ "--- ------ -------------- ----- \n",
+ " 0 Age 615 non-null int64 \n",
+ " 1 Sex 615 non-null int32 \n",
+ " 2 ALB 615 non-null float64\n",
+ " 3 ALP 615 non-null float64\n",
+ " 4 ALT 615 non-null float64\n",
+ " 5 AST 615 non-null float64\n",
+ " 6 BIL 615 non-null float64\n",
+ " 7 CHE 615 non-null float64\n",
+ " 8 CHOL 615 non-null float64\n",
+ " 9 CREA 615 non-null float64\n",
+ " 10 GGT 615 non-null float64\n",
+ " 11 PROT 615 non-null float64\n",
+ " 12 Category_0=Blood Donor 615 non-null bool \n",
+ " 13 Category_0s=suspect Blood Donor 615 non-null bool \n",
+ " 14 Category_1=Hepatitis 615 non-null bool \n",
+ " 15 Category_2=Fibrosis 615 non-null bool \n",
+ " 16 Category_3=Cirrhosis 615 non-null bool \n",
+ "dtypes: bool(5), float64(10), int32(1), int64(1)\n",
+ "memory usage: 58.4 KB\n"
+ ]
+ }
+ ],
+ "source": [
+ "# By this point `df` has already been encoded and imputed, so measuring it here would report the\n",
+ "# processed size under a 'before' label. Measure a fresh read of the raw CSV to get the real baseline.\n",
+ "memory_usage_before = pd.read_csv(\"HepatitisCdata.csv\").memory_usage(deep=True).sum()\n",
+ "print(f\"Memory usage before processing: {memory_usage_before / 1024 ** 2:.2f} MB\")\n",
+ "\n",
+ "# Column dtypes and memory usage of the processed frame, for comparison.\n",
+ "df.info()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 37,
+ "id": "6b6aa90d",
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "Memory usage after processing: 0.06 MB\n"
+ ]
+ }
+ ],
+ "source": [
+ "# Deep memory footprint (in bytes) of the encoded and imputed frame, reported in MB.\n",
+ "memory_usage_after = df.memory_usage(index=True, deep=True).sum()\n",
+ "print(f\"Memory usage after processing: {memory_usage_after / (1024 * 1024):.2f} MB\")"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "9d0f29a3",
+ "metadata": {},
+ "source": [
+ "## 6. Explore the statistical facts like mean, median, x percentiles of the columns"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 38,
+ "id": "59fd3aad",
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " Age | \n",
+ " Sex | \n",
+ " ALB | \n",
+ " ALP | \n",
+ " ALT | \n",
+ " AST | \n",
+ " BIL | \n",
+ " CHE | \n",
+ " CHOL | \n",
+ " CREA | \n",
+ " GGT | \n",
+ " PROT | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " count | \n",
+ " 615.000000 | \n",
+ " 615.000000 | \n",
+ " 615.000000 | \n",
+ " 615.000000 | \n",
+ " 615.000000 | \n",
+ " 615.000000 | \n",
+ " 615.000000 | \n",
+ " 615.000000 | \n",
+ " 615.000000 | \n",
+ " 615.000000 | \n",
+ " 615.000000 | \n",
+ " 615.000000 | \n",
+ "
\n",
+ " \n",
+ " mean | \n",
+ " 47.408130 | \n",
+ " 0.613008 | \n",
+ " 41.620195 | \n",
+ " 68.283920 | \n",
+ " 28.450814 | \n",
+ " 34.786341 | \n",
+ " 11.396748 | \n",
+ " 8.196634 | \n",
+ " 5.368099 | \n",
+ " 81.287805 | \n",
+ " 39.533171 | \n",
+ " 72.044137 | \n",
+ "
\n",
+ " \n",
+ " std | \n",
+ " 10.055105 | \n",
+ " 0.487458 | \n",
+ " 5.775920 | \n",
+ " 25.643955 | \n",
+ " 25.448940 | \n",
+ " 33.090690 | \n",
+ " 19.673150 | \n",
+ " 2.205657 | \n",
+ " 1.123466 | \n",
+ " 49.756166 | \n",
+ " 54.661071 | \n",
+ " 5.398234 | \n",
+ "
\n",
+ " \n",
+ " min | \n",
+ " 19.000000 | \n",
+ " 0.000000 | \n",
+ " 14.900000 | \n",
+ " 11.300000 | \n",
+ " 0.900000 | \n",
+ " 10.600000 | \n",
+ " 0.800000 | \n",
+ " 1.420000 | \n",
+ " 1.430000 | \n",
+ " 8.000000 | \n",
+ " 4.500000 | \n",
+ " 44.800000 | \n",
+ "
\n",
+ " \n",
+ " 10% | \n",
+ " 34.000000 | \n",
+ " 0.000000 | \n",
+ " 35.640000 | \n",
+ " 42.740000 | \n",
+ " 11.900000 | \n",
+ " 18.180000 | \n",
+ " 3.700000 | \n",
+ " 5.614000 | \n",
+ " 4.038000 | \n",
+ " 61.000000 | \n",
+ " 11.900000 | \n",
+ " 66.300000 | \n",
+ "
\n",
+ " \n",
+ " 25% | \n",
+ " 39.000000 | \n",
+ " 0.000000 | \n",
+ " 38.800000 | \n",
+ " 52.950000 | \n",
+ " 16.400000 | \n",
+ " 21.600000 | \n",
+ " 5.300000 | \n",
+ " 6.935000 | \n",
+ " 4.620000 | \n",
+ " 67.000000 | \n",
+ " 15.700000 | \n",
+ " 69.300000 | \n",
+ "
\n",
+ " \n",
+ " 50% | \n",
+ " 47.000000 | \n",
+ " 1.000000 | \n",
+ " 41.900000 | \n",
+ " 66.700000 | \n",
+ " 23.000000 | \n",
+ " 25.900000 | \n",
+ " 7.300000 | \n",
+ " 8.260000 | \n",
+ " 5.310000 | \n",
+ " 77.000000 | \n",
+ " 23.300000 | \n",
+ " 72.200000 | \n",
+ "
\n",
+ " \n",
+ " 75% | \n",
+ " 54.000000 | \n",
+ " 1.000000 | \n",
+ " 45.200000 | \n",
+ " 79.300000 | \n",
+ " 33.050000 | \n",
+ " 32.900000 | \n",
+ " 11.200000 | \n",
+ " 9.590000 | \n",
+ " 6.055000 | \n",
+ " 88.000000 | \n",
+ " 40.200000 | \n",
+ " 75.400000 | \n",
+ "
\n",
+ " \n",
+ " 99% | \n",
+ " 71.000000 | \n",
+ " 1.000000 | \n",
+ " 53.258000 | \n",
+ " 137.158000 | \n",
+ " 118.086000 | \n",
+ " 187.322000 | \n",
+ " 87.640000 | \n",
+ " 13.851600 | \n",
+ " 8.580400 | \n",
+ " 134.826000 | \n",
+ " 292.534000 | \n",
+ " 82.686000 | \n",
+ "
\n",
+ " \n",
+ " max | \n",
+ " 77.000000 | \n",
+ " 1.000000 | \n",
+ " 82.200000 | \n",
+ " 416.600000 | \n",
+ " 325.300000 | \n",
+ " 324.000000 | \n",
+ " 254.000000 | \n",
+ " 16.410000 | \n",
+ " 9.670000 | \n",
+ " 1079.100000 | \n",
+ " 650.900000 | \n",
+ " 90.000000 | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " Age Sex ALB ALP ALT AST \\\n",
+ "count 615.000000 615.000000 615.000000 615.000000 615.000000 615.000000 \n",
+ "mean 47.408130 0.613008 41.620195 68.283920 28.450814 34.786341 \n",
+ "std 10.055105 0.487458 5.775920 25.643955 25.448940 33.090690 \n",
+ "min 19.000000 0.000000 14.900000 11.300000 0.900000 10.600000 \n",
+ "10% 34.000000 0.000000 35.640000 42.740000 11.900000 18.180000 \n",
+ "25% 39.000000 0.000000 38.800000 52.950000 16.400000 21.600000 \n",
+ "50% 47.000000 1.000000 41.900000 66.700000 23.000000 25.900000 \n",
+ "75% 54.000000 1.000000 45.200000 79.300000 33.050000 32.900000 \n",
+ "99% 71.000000 1.000000 53.258000 137.158000 118.086000 187.322000 \n",
+ "max 77.000000 1.000000 82.200000 416.600000 325.300000 324.000000 \n",
+ "\n",
+ " BIL CHE CHOL CREA GGT PROT \n",
+ "count 615.000000 615.000000 615.000000 615.000000 615.000000 615.000000 \n",
+ "mean 11.396748 8.196634 5.368099 81.287805 39.533171 72.044137 \n",
+ "std 19.673150 2.205657 1.123466 49.756166 54.661071 5.398234 \n",
+ "min 0.800000 1.420000 1.430000 8.000000 4.500000 44.800000 \n",
+ "10% 3.700000 5.614000 4.038000 61.000000 11.900000 66.300000 \n",
+ "25% 5.300000 6.935000 4.620000 67.000000 15.700000 69.300000 \n",
+ "50% 7.300000 8.260000 5.310000 77.000000 23.300000 72.200000 \n",
+ "75% 11.200000 9.590000 6.055000 88.000000 40.200000 75.400000 \n",
+ "99% 87.640000 13.851600 8.580400 134.826000 292.534000 82.686000 \n",
+ "max 254.000000 16.410000 9.670000 1079.100000 650.900000 90.000000 "
+ ]
+ },
+ "execution_count": 38,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "# Summary statistics (count, mean, std, min, max) of the numeric columns, together with the\n",
+ "# 10th, 25th, 50th (median), 75th and 99th percentiles.\n",
+ "df.describe(percentiles=[0.10, 0.25, 0.50, 0.75, 0.99])"
+ ]
+ }
+ ],
+ "metadata": {
+ "kernelspec": {
+ "display_name": "Python 3 (ipykernel)",
+ "language": "python",
+ "name": "python3"
+ },
+ "language_info": {
+ "codemirror_mode": {
+ "name": "ipython",
+ "version": 3
+ },
+ "file_extension": ".py",
+ "mimetype": "text/x-python",
+ "name": "python",
+ "nbconvert_exporter": "python",
+ "pygments_lexer": "ipython3",
+ "version": "3.11.5"
+ }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+}