diff --git a/Lung Cancer Detection/Lung-cancer-detection-Exploration.ipynb b/Lung Cancer Detection/Lung-cancer-detection-Exploration.ipynb new file mode 100644 index 00000000..1220d229 --- /dev/null +++ b/Lung Cancer Detection/Lung-cancer-detection-Exploration.ipynb @@ -0,0 +1,655 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 4, + "id": "d5540bc1", + "metadata": {}, + "outputs": [], + "source": [ + "import numpy as np\n", + "import pandas as pd\n", + "import matplotlib.pyplot as plt" + ] + }, + { + "cell_type": "markdown", + "id": "01373691", + "metadata": {}, + "source": [ + "## Loading the Dataset" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "id": "79de3d42", + "metadata": {}, + "outputs": [], + "source": [ + "df=pd.read_csv(\"Lung_Cancer_Dataset.csv\")" + ] + }, + { + "cell_type": "markdown", + "id": "851ed69b", + "metadata": {}, + "source": [ + "## Features and Label" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "id": "9d065c97", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
GENDERAGESMOKINGYELLOW_FINGERSANXIETYPEER_PRESSURECHRONIC DISEASEFATIGUEALLERGYWHEEZINGALCOHOL CONSUMINGCOUGHINGSHORTNESS OF BREATHSWALLOWING DIFFICULTYCHEST PAINLUNG_CANCER
0M691221121222222YES
1M742111222111222YES
2F591112121212212NO
3M632221111121122NO
4F631211111212211NO
\n", + "
" + ], + "text/plain": [ + " GENDER AGE SMOKING YELLOW_FINGERS ANXIETY PEER_PRESSURE \\\n", + "0 M 69 1 2 2 1 \n", + "1 M 74 2 1 1 1 \n", + "2 F 59 1 1 1 2 \n", + "3 M 63 2 2 2 1 \n", + "4 F 63 1 2 1 1 \n", + "\n", + " CHRONIC DISEASE FATIGUE ALLERGY WHEEZING ALCOHOL CONSUMING COUGHING \\\n", + "0 1 2 1 2 2 2 \n", + "1 2 2 2 1 1 1 \n", + "2 1 2 1 2 1 2 \n", + "3 1 1 1 1 2 1 \n", + "4 1 1 1 2 1 2 \n", + "\n", + " SHORTNESS OF BREATH SWALLOWING DIFFICULTY CHEST PAIN LUNG_CANCER \n", + "0 2 2 2 YES \n", + "1 2 2 2 YES \n", + "2 2 1 2 NO \n", + "3 1 2 2 NO \n", + "4 2 1 1 NO " + ] + }, + "execution_count": 6, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df.head(5)" + ] + }, + { + "cell_type": "markdown", + "id": "4a8a9caf", + "metadata": {}, + "source": [ + "#### Features:\n", + "GENDER, AGE, SMOKING, YELLOW_FINGERS, ANXIETY, PEER_PRESSURE, CHRONIC DISEASE, FATIGUE, ALLERGY, WHEEZING, ALCOHOL CONSUMING, COUGHING, SHORTNESS OF BREATH, SWALLOWING DIFFICULTY, CHEST PAIN\n", + "\n", + "#### Label:\n", + "LUNG_CANCER" + ] + }, + { + "cell_type": "markdown", + "id": "a564ab24", + "metadata": {}, + "source": [ + "## Shape of the dataset" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "id": "8b5d6d26", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "(309, 16)" + ] + }, + "execution_count": 7, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df.shape" + ] + }, + { + "cell_type": "markdown", + "id": "f247e019", + "metadata": {}, + "source": [ + "### The dataset contains 309 rows and 16 columns (15 features and 1 label)." + ] + }, + { + "cell_type": "markdown", + "id": "94767c78", + "metadata": {}, + "source": [ + "## Data Type and Memory Usage" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "id": "53064761", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + "RangeIndex: 309 entries, 0 to 308\n", + "Data 
columns (total 16 columns):\n", + " # Column Non-Null Count Dtype \n", + "--- ------ -------------- ----- \n", + " 0 GENDER 309 non-null object\n", + " 1 AGE 309 non-null int64 \n", + " 2 SMOKING 309 non-null int64 \n", + " 3 YELLOW_FINGERS 309 non-null int64 \n", + " 4 ANXIETY 309 non-null int64 \n", + " 5 PEER_PRESSURE 309 non-null int64 \n", + " 6 CHRONIC DISEASE 309 non-null int64 \n", + " 7 FATIGUE 309 non-null int64 \n", + " 8 ALLERGY 309 non-null int64 \n", + " 9 WHEEZING 309 non-null int64 \n", + " 10 ALCOHOL CONSUMING 309 non-null int64 \n", + " 11 COUGHING 309 non-null int64 \n", + " 12 SHORTNESS OF BREATH 309 non-null int64 \n", + " 13 SWALLOWING DIFFICULTY 309 non-null int64 \n", + " 14 CHEST PAIN 309 non-null int64 \n", + " 15 LUNG_CANCER 309 non-null object\n", + "dtypes: int64(14), object(2)\n", + "memory usage: 38.8+ KB\n" + ] + } + ], + "source": [ + "df.info()" + ] + }, + { + "cell_type": "markdown", + "id": "f598a82d", + "metadata": {}, + "source": [ + "### One feature (GENDER) has the \"object\" data type; all other features have the \"int64\" data type.\n", + "### The label (LUNG_CANCER) also has the \"object\" data type.\n", + "### Memory usage: at least 38.8 KB (the \"+\" in the output means the memory used by the object columns is not fully counted)." + ] + }, + { + "cell_type": "markdown", + "id": "b989c833", + "metadata": {}, + "source": [ + "## Statistical Analysis" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "id": "0616237a", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
AGESMOKINGYELLOW_FINGERSANXIETYPEER_PRESSURECHRONIC DISEASEFATIGUEALLERGYWHEEZINGALCOHOL CONSUMINGCOUGHINGSHORTNESS OF BREATHSWALLOWING DIFFICULTYCHEST PAIN
count309.000000309.000000309.000000309.000000309.000000309.000000309.000000309.000000309.000000309.000000309.000000309.000000309.000000309.000000
mean62.6731391.5631071.5695791.4983821.5016181.5048541.6731391.5566341.5566341.5566341.5792881.6407771.4692561.556634
std8.2103010.4968060.4959380.5008080.5008080.5007870.4698270.4975880.4975880.4975880.4944740.4805510.4998630.497588
min21.0000001.0000001.0000001.0000001.0000001.0000001.0000001.0000001.0000001.0000001.0000001.0000001.0000001.000000
25%57.0000001.0000001.0000001.0000001.0000001.0000001.0000001.0000001.0000001.0000001.0000001.0000001.0000001.000000
50%62.0000002.0000002.0000001.0000002.0000002.0000002.0000002.0000002.0000002.0000002.0000002.0000001.0000002.000000
75%69.0000002.0000002.0000002.0000002.0000002.0000002.0000002.0000002.0000002.0000002.0000002.0000002.0000002.000000
max87.0000002.0000002.0000002.0000002.0000002.0000002.0000002.0000002.0000002.0000002.0000002.0000002.0000002.000000
\n", + "
" + ], + "text/plain": [ + " AGE SMOKING YELLOW_FINGERS ANXIETY PEER_PRESSURE \\\n", + "count 309.000000 309.000000 309.000000 309.000000 309.000000 \n", + "mean 62.673139 1.563107 1.569579 1.498382 1.501618 \n", + "std 8.210301 0.496806 0.495938 0.500808 0.500808 \n", + "min 21.000000 1.000000 1.000000 1.000000 1.000000 \n", + "25% 57.000000 1.000000 1.000000 1.000000 1.000000 \n", + "50% 62.000000 2.000000 2.000000 1.000000 2.000000 \n", + "75% 69.000000 2.000000 2.000000 2.000000 2.000000 \n", + "max 87.000000 2.000000 2.000000 2.000000 2.000000 \n", + "\n", + " CHRONIC DISEASE FATIGUE ALLERGY WHEEZING ALCOHOL CONSUMING \\\n", + "count 309.000000 309.000000 309.000000 309.000000 309.000000 \n", + "mean 1.504854 1.673139 1.556634 1.556634 1.556634 \n", + "std 0.500787 0.469827 0.497588 0.497588 0.497588 \n", + "min 1.000000 1.000000 1.000000 1.000000 1.000000 \n", + "25% 1.000000 1.000000 1.000000 1.000000 1.000000 \n", + "50% 2.000000 2.000000 2.000000 2.000000 2.000000 \n", + "75% 2.000000 2.000000 2.000000 2.000000 2.000000 \n", + "max 2.000000 2.000000 2.000000 2.000000 2.000000 \n", + "\n", + " COUGHING SHORTNESS OF BREATH SWALLOWING DIFFICULTY CHEST PAIN \n", + "count 309.000000 309.000000 309.000000 309.000000 \n", + "mean 1.579288 1.640777 1.469256 1.556634 \n", + "std 0.494474 0.480551 0.499863 0.497588 \n", + "min 1.000000 1.000000 1.000000 1.000000 \n", + "25% 1.000000 1.000000 1.000000 1.000000 \n", + "50% 2.000000 2.000000 1.000000 2.000000 \n", + "75% 2.000000 2.000000 2.000000 2.000000 \n", + "max 2.000000 2.000000 2.000000 2.000000 " + ] + }, + "execution_count": 9, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df.describe()" + ] + }, + { + "cell_type": "markdown", + "id": "b4ee62a9", + "metadata": {}, + "source": [ + "### Reading the summary above (percentiles are computed on the values sorted in ascending order): \n", + "\n", + "### The average age is about 62.7, the minimum age is 21 and the maximum age is 87. \n", + "### 25% of the patients are aged 57 or younger. 
\n", + "### 50% of the patients are aged 62 or younger. \n", + "### 75% of the patients are aged 69 or younger. \n", + "### The remaining features and the label are categorical, each with two categories." + ] + }, + { + "cell_type": "markdown", + "id": "5a62df6e", + "metadata": {}, + "source": [ + "## Null values check" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "id": "5d592446", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "GENDER 0\n", + "AGE 0\n", + "SMOKING 0\n", + "YELLOW_FINGERS 0\n", + "ANXIETY 0\n", + "PEER_PRESSURE 0\n", + "CHRONIC DISEASE 0\n", + "FATIGUE 0\n", + "ALLERGY 0\n", + "WHEEZING 0\n", + "ALCOHOL CONSUMING 0\n", + "COUGHING 0\n", + "SHORTNESS OF BREATH 0\n", + "SWALLOWING DIFFICULTY 0\n", + "CHEST PAIN 0\n", + "LUNG_CANCER 0\n", + "dtype: int64" + ] + }, + "execution_count": 10, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df.isnull().sum()" + ] + }, + { + "cell_type": "markdown", + "id": "44d3ac21", + "metadata": {}, + "source": [ + "### There are no null values in the dataset." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "c4995c1c", + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.11.5" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/Lung-cancer-detection-Exploration.ipynb b/Lung-cancer-detection-Exploration.ipynb new file mode 100644 index 00000000..e69de29b