diff --git a/Movie Genre Classification/Movie_genre_classification.ipynb b/Movie Genre Classification/Movie_genre_classification.ipynb new file mode 100644 index 00000000..7d4797a1 --- /dev/null +++ b/Movie Genre Classification/Movie_genre_classification.ipynb @@ -0,0 +1,2538 @@ +{ + "nbformat": 4, + "nbformat_minor": 0, + "metadata": { + "colab": { + "provenance": [] + }, + "kernelspec": { + "name": "python3", + "display_name": "Python 3" + }, + "language_info": { + "name": "python" + } + }, + "cells": [ + { + "cell_type": "markdown", + "source": [ + "Importing the Dependencies" + ], + "metadata": { + "id": "xAnsf_OhJSwu" + } + }, + { + "cell_type": "code", + "execution_count": 33, + "metadata": { + "id": "zr0gTcDqFfd9" + }, + "outputs": [], + "source": [ + "import pandas as pd\n", + "import numpy as np\n", + "import seaborn as sns\n", + "import matplotlib.pyplot as plt\n", + "import re # for pattern matching and text manipulation.\n", + "import string\n", + "import nltk\n", + "from sklearn.preprocessing import LabelEncoder\n", + "from nltk.corpus import stopwords\n", + "from nltk.tokenize import word_tokenize\n", + "from nltk.stem import PorterStemmer, WordNetLemmatizer\n", + "from sklearn.feature_extraction.text import CountVectorizer as CV\n", + "from sklearn.feature_extraction.text import TfidfVectorizer\n", + "from sklearn.svm import SVC\n", + "from sklearn.linear_model import LogisticRegression\n", + "from sklearn.tree import DecisionTreeClassifier\n", + "from sklearn.ensemble import RandomForestClassifier\n", + "from sklearn.naive_bayes import MultinomialNB\n", + "from sklearn.metrics import accuracy_score\n", + "from sklearn.metrics import classification_report, confusion_matrix\n", + "from sklearn.model_selection import train_test_split\n", + "from sklearn.metrics import accuracy_score" + ] + }, + { + "cell_type": "markdown", + "source": [ + "Loading the Data" + ], + "metadata": { + "id": "dZqzFjZ8JgYu" + } + }, + { + "cell_type": "code", + "source": [ +
"train_data = pd.read_csv(\"/content/train_data.txt\", sep=':::', names=[\"title\", \"genre\", \"description\"], engine='python')\n", + "test_data = pd.read_csv(\"/content/test_data.txt\", sep=':::', names=[\"title\", \"description\"], engine='python')" + ], + "metadata": { + "id": "Mu6b8P7tMAsi" + }, + "execution_count": 34, + "outputs": [] + }, + { + "cell_type": "markdown", + "source": [ + "**Data Visualization**" + ], + "metadata": { + "id": "Z1SLzko8UTnp" + } + }, + { + "cell_type": "code", + "source": [ + "train_data.head()" + ], + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 206 + }, + "id": "BMdL2ffAUNpS", + "outputId": "abeb99d2-dd32-47aa-8c27-1c60ba3e3c8f" + }, + "execution_count": 35, + "outputs": [ + { + "output_type": "execute_result", + "data": { + "text/plain": [ + " title genre \\\n", + "1 Oscar et la dame rose (2009) drama \n", + "2 Cupid (1997) thriller \n", + "3 Young, Wild and Wonderful (1980) adult \n", + "4 The Secret Sin (1915) drama \n", + "5 The Unrecovered (2007) drama \n", + "\n", + " description \n", + "1 Listening in to a conversation between his do... \n", + "2 A brother and sister with a past incestuous r... \n", + "3 As the bus empties the students for their fie... \n", + "4 To help their unemployed father make ends mee... \n", + "5 The film's title refers not only to the un-re... " + ], + "text/html": [ + "\n", + "
\n", + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
titlegenredescription
1Oscar et la dame rose (2009)dramaListening in to a conversation between his do...
2Cupid (1997)thrillerA brother and sister with a past incestuous r...
3Young, Wild and Wonderful (1980)adultAs the bus empties the students for their fie...
4The Secret Sin (1915)dramaTo help their unemployed father make ends mee...
5The Unrecovered (2007)dramaThe film's title refers not only to the un-re...
\n", + "
\n", + "
\n", + "\n", + "
\n", + " \n", + "\n", + " \n", + "\n", + " \n", + "
\n", + "\n", + "\n", + "
\n", + " \n", + "\n", + "\n", + "\n", + " \n", + "
\n", + "\n", + "
\n", + "
\n" + ] + }, + "metadata": {}, + "execution_count": 35 + } + ] + }, + { + "cell_type": "code", + "source": [ + "test_data.head()" + ], + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 206 + }, + "id": "MhwEX9IjUk3y", + "outputId": "61dff6bf-e460-4975-bf5b-aaa98f3b5163" + }, + "execution_count": 36, + "outputs": [ + { + "output_type": "execute_result", + "data": { + "text/plain": [ + " title \\\n", + "1 Edgar's Lunch (1998) \n", + "2 La guerra de papá (1977) \n", + "3 Off the Beaten Track (2010) \n", + "4 Meu Amigo Hindu (2015) \n", + "5 Er nu zhai (1955) \n", + "\n", + " description \n", + "1 L.R. Brane loves his life - his car, his apar... \n", + "2 Spain, March 1964: Quico is a very naughty ch... \n", + "3 One year in the life of Albin and his family ... \n", + "4 His father has died, he hasn't spoken with hi... \n", + "5 Before he was known internationally as a mart... " + ], + "text/html": [ + "\n", + "
\n", + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
titledescription
1Edgar's Lunch (1998)L.R. Brane loves his life - his car, his apar...
2La guerra de papá (1977)Spain, March 1964: Quico is a very naughty ch...
3Off the Beaten Track (2010)One year in the life of Albin and his family ...
4Meu Amigo Hindu (2015)His father has died, he hasn't spoken with hi...
5Er nu zhai (1955)Before he was known internationally as a mart...
\n", + "
\n", + "
\n", + "\n", + "
\n", + " \n", + "\n", + " \n", + "\n", + " \n", + "
\n", + "\n", + "\n", + "
\n", + " \n", + "\n", + "\n", + "\n", + " \n", + "
\n", + "\n", + "
\n", + "
\n" + ] + }, + "metadata": {}, + "execution_count": 36 + } + ] + }, + { + "cell_type": "code", + "source": [ + "plt.figure(figsize=(30,10))\n", + "counts = train_data.genre.value_counts()\n", + "sns.barplot(x=counts.index, y=counts)\n", + "plt.xlabel('Genre')\n", + "plt.ylabel('Count')\n", + "plt.xticks(rotation=90);" + ], + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 554 + }, + "id": "V1nlOD9kb599", + "outputId": "7a949d80-66ec-4f25-9e45-bbe0b462d2b4" + }, + "execution_count": 37, + "outputs": [ + { + "output_type": "display_data", + "data": { + "text/plain": [ + "
" + ], + "image/png": "\n" + }, + "metadata": {} + } + ] + }, + { + "cell_type": "code", + "source": [ + "train_data.describe()" + ], + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 175 + }, + "id": "WzU5-mYPb6Qt", + "outputId": "e5697c7d-004c-44e7-beb5-501ae35560aa" + }, + "execution_count": 38, + "outputs": [ + { + "output_type": "execute_result", + "data": { + "text/plain": [ + " title genre \\\n", + "count 54214 54214 \n", + "unique 54214 27 \n", + "top Oscar et la dame rose (2009) drama \n", + "freq 1 13613 \n", + "\n", + " description \n", + "count 54214 \n", + "unique 54086 \n", + "top Grammy - music award of the American academy ... \n", + "freq 12 " + ], + "text/html": [ + "\n", + "
\n", + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
titlegenredescription
count542145421454214
unique542142754086
topOscar et la dame rose (2009)dramaGrammy - music award of the American academy ...
freq11361312
\n", + "
\n", + "
\n", + "\n", + "
\n", + " \n", + "\n", + " \n", + "\n", + " \n", + "
\n", + "\n", + "\n", + "
\n", + " \n", + "\n", + "\n", + "\n", + " \n", + "
\n", + "\n", + "
\n", + "
\n" + ] + }, + "metadata": {}, + "execution_count": 38 + } + ] + }, + { + "cell_type": "code", + "source": [ + "train_data.genre.value_counts()" + ], + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "rI6lrrOqb6yH", + "outputId": "188a9671-cb81-4410-d5ff-8e522e242200" + }, + "execution_count": 39, + "outputs": [ + { + "output_type": "execute_result", + "data": { + "text/plain": [ + " drama 13613\n", + " documentary 13096\n", + " comedy 7447\n", + " short 5073\n", + " horror 2204\n", + " thriller 1591\n", + " action 1315\n", + " western 1032\n", + " reality-tv 884\n", + " family 784\n", + " adventure 775\n", + " music 731\n", + " romance 672\n", + " sci-fi 647\n", + " adult 590\n", + " crime 505\n", + " animation 498\n", + " sport 432\n", + " talk-show 391\n", + " fantasy 323\n", + " mystery 319\n", + " musical 277\n", + " biography 265\n", + " history 243\n", + " game-show 194\n", + " news 181\n", + " war 132\n", + "Name: genre, dtype: int64" + ] + }, + "metadata": {}, + "execution_count": 39 + } + ] + }, + { + "cell_type": "markdown", + "source": [ + "Data Cleaning & Preprocessing" + ], + "metadata": { + "id": "rRW7-cLNcJls" + } + }, + { + "cell_type": "code", + "source": [ + "train_data.info()" + ], + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "ob5mql64b7HL", + "outputId": "271b8a6b-ff95-40d0-8e6d-730d8af7f4b0" + }, + "execution_count": 40, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "\n", + "Int64Index: 54214 entries, 1 to 54214\n", + "Data columns (total 3 columns):\n", + " # Column Non-Null Count Dtype \n", + "--- ------ -------------- ----- \n", + " 0 title 54214 non-null object\n", + " 1 genre 54214 non-null object\n", + " 2 description 54214 non-null object\n", + "dtypes: object(3)\n", + "memory usage: 1.7+ MB\n" + ] + } + ] + }, + { + "cell_type": "code", + "source": [ + "train_data.isnull().sum()" + ], + "metadata": { + "colab": { + "base_uri": 
"https://localhost:8080/" + }, + "id": "Nx2spN5kb7gW", + "outputId": "aa261554-a9ce-4052-f095-70a7291c6050" + }, + "execution_count": 41, + "outputs": [ + { + "output_type": "execute_result", + "data": { + "text/plain": [ + "title 0\n", + "genre 0\n", + "description 0\n", + "dtype: int64" + ] + }, + "metadata": {}, + "execution_count": 41 + } + ] + }, + { + "cell_type": "code", + "source": [ + "print(\"shape before drop duplicates\",train_data.shape)\n", + "# Drop exact duplicate rows (this cell removes duplicates, not nulls)\n", + "train_data = train_data.drop_duplicates()\n", + "print(\"shape after drop duplicates\",train_data.shape)" + ], + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "R4-PYCq3b71k", + "outputId": "c0a71f9a-b1d0-4379-8fdd-a069678b9f7b" + }, + "execution_count": 42, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "shape before drop nulls (54214, 3)\n", + "shape after drop nulls (54214, 3)\n" + ] + } + ] + }, + { + "cell_type": "markdown", + "source": [ + "Text Cleaning" + ], + "metadata": { + "id": "nxqUU3-ScRob" + } + }, + { + "cell_type": "code", + "source": [ + "# Name every resource explicitly: a bare nltk.download() opens an interactive prompt and blocks Restart & Run All\n", + "nltk.download('stopwords')\n", + "nltk.download('wordnet')" + ], + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "b-uFVp55b8Sz", + "outputId": "4bb0fd96-7525-4076-e9df-8cf982135d17" + }, + "execution_count": 45, + "outputs": [ + { + "output_type": "stream", + "name": "stderr", + "text": [ + "[nltk_data] Downloading package stopwords to /root/nltk_data...\n", + "[nltk_data] Unzipping corpora/stopwords.zip.\n", + "[nltk_data] Downloading package wordnet to /root/nltk_data...\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "NLTK Downloader\n", + "---------------------------------------------------------------------------\n", + " d) Download l) List u) Update c) Config h) Help q) Quit\n", + "---------------------------------------------------------------------------\n", + "Downloader> q\n" 
+ ] + }, + { + "output_type": "execute_result", + "data": { + "text/plain": [ + "True" + ] + }, + "metadata": {}, + "execution_count": 45 + } + ] + }, + { + "cell_type": "code", + "source": [ + "stop_words = set(stopwords.words('english')) # stop words\n", + "def clean_data(text):\n", + "    \"\"\"Return `text` lowercased and reduced to space-joined content words.\n", + "\n", + "    URLs, e-mail addresses, @mentions and pic.* links are removed first, while the\n", + "    punctuation that identifies them is still present. Every character that is not\n", + "    an English letter is then dropped, the text is tokenized, and stop words and\n", + "    tokens of one or two characters are discarded. Non-string input (for example\n", + "    NaN from a missing field) yields an empty string.\n", + "    \"\"\"\n", + "    if not isinstance(text, str):\n", + "        return \"\"\n", + "    text = text.lower()\n", + "    text = re.sub(r'http\\S+', '', text) # [1] remove urls\n", + "    text = re.sub(r'[\\w\\.-]+@[\\w\\.-]+', '', text) # [2] remove emails; must precede [3], which would strip the '@domain' half\n", + "    text = re.sub(r'@\\S+', '', text) # [3] remove mentions\n", + "    text = re.sub(r'\\bpic\\.\\S+', '', text) # [4] remove pic.* links; the escaped dot keeps words such as 'picture' intact\n", + "    text = re.sub(r\"[^a-zA-Z+']\", ' ', text) # [5] keep only english chars; digits, '#', '_', brackets and newlines become spaces\n", + "    text = re.sub(r'\\s+[a-zA-Z]\\s+', ' ', text+' ') # [6] remove single chars\n", + "    text = \"\".join([char for char in text if char not in string.punctuation]) # [7] remove the remaining punctuation\n", + "    text = re.sub(r'\\s\\s+', ' ', text).strip() # [8] remove repeated/leading/trailing spaces\n", + "    tokens = word_tokenize(text) # [9] tokenize\n", + "    text = \" \".join([word for word in tokens if word not in stop_words and len(word) > 2]) # [10] remove stop words and tokens shorter than 3 chars\n", + "\n", + "    return text" + ], + "metadata": { + "id": "v94Qmrhcb8hB" + }, + "execution_count": 46, + "outputs": [] + }, + { + "cell_type": "markdown", + "source": [ + "Some Data Statistics" + ], + "metadata": { + "id": "gDuUdSM9ccnt" + } + }, + { + "cell_type": "code", + "source": [ + "import nltk\n", + "nltk.download('punkt')\n", + "train_data['description_cleaned'] = train_data['description'].apply(clean_data)\n", + "test_data['description_cleaned'] = test_data['description'].apply(clean_data)" + ], + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": 
"ce58fCieb8tf", + "outputId": "d7ab0804-6e4e-4798-a057-8572fc1aadf3" + }, + "execution_count": 49, + "outputs": [ + { + "output_type": "stream", + "name": "stderr", + "text": [ + "[nltk_data] Downloading package punkt to /root/nltk_data...\n", + "[nltk_data] Unzipping tokenizers/punkt.zip.\n" + ] + } + ] + }, + { + "cell_type": "markdown", + "source": [ + "Stemming" + ], + "metadata": { + "id": "7_VzRAg_chWT" + } + }, + { + "cell_type": "code", + "source": [ + "st = PorterStemmer()\n", + "train_data['description_cleaned'] = train_data['description_cleaned'].apply(\n", + " lambda x: ' '.join([st.stem(word) for word in x.split()]))\n", + "\n", + "test_data['description_cleaned'] = test_data['description_cleaned'].apply(\n", + " lambda x: ' '.join([st.stem(word) for word in x.split()]))" + ], + "metadata": { + "id": "U1h6vv4db86u" + }, + "execution_count": 50, + "outputs": [] + }, + { + "cell_type": "code", + "source": [ + "train_data['length']=train_data['description'].apply(len)\n", + "train_data['length_cleaned']=train_data['description_cleaned'].apply(len)\n", + "train_data.head()" + ], + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 206 + }, + "id": "PVPsLkmyb9HV", + "outputId": "1cf046af-7657-4492-9047-3a93e0eb67ec" + }, + "execution_count": 51, + "outputs": [ + { + "output_type": "execute_result", + "data": { + "text/plain": [ + " title genre \\\n", + "1 Oscar et la dame rose (2009) drama \n", + "2 Cupid (1997) thriller \n", + "3 Young, Wild and Wonderful (1980) adult \n", + "4 The Secret Sin (1915) drama \n", + "5 The Unrecovered (2007) drama \n", + "\n", + " description \\\n", + "1 Listening in to a conversation between his do... \n", + "2 A brother and sister with a past incestuous r... \n", + "3 As the bus empties the students for their fie... \n", + "4 To help their unemployed father make ends mee... \n", + "5 The film's title refers not only to the un-re... 
\n", + "\n", + " description_cleaned length length_cleaned \n", + "1 listen convers doctor parent year old oscar le... 546 339 \n", + "2 brother sister past incestu relationship curre... 184 111 \n", + "3 bu empti student field trip museum natur histo... 650 344 \n", + "4 help unemploy father make end meet edith twin ... 1082 678 \n", + "5 film titl refer recov bodi ground zero also st... 625 347 " + ], + "text/html": [ + "\n", + "
\n", + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
titlegenredescriptiondescription_cleanedlengthlength_cleaned
1Oscar et la dame rose (2009)dramaListening in to a conversation between his do...listen convers doctor parent year old oscar le...546339
2Cupid (1997)thrillerA brother and sister with a past incestuous r...brother sister past incestu relationship curre...184111
3Young, Wild and Wonderful (1980)adultAs the bus empties the students for their fie...bu empti student field trip museum natur histo...650344
4The Secret Sin (1915)dramaTo help their unemployed father make ends mee...help unemploy father make end meet edith twin ...1082678
5The Unrecovered (2007)dramaThe film's title refers not only to the un-re...film titl refer recov bodi ground zero also st...625347
\n", + "
\n", + "
\n", + "\n", + "
\n", + " \n", + "\n", + " \n", + "\n", + " \n", + "
\n", + "\n", + "\n", + "
\n", + " \n", + "\n", + "\n", + "\n", + " \n", + "
\n", + "\n", + "
\n", + "
\n" + ] + }, + "metadata": {}, + "execution_count": 51 + } + ] + }, + { + "cell_type": "code", + "source": [ + "print(\"Average Length of Text Before Cleaning: \", train_data['length'].mean())\n", + "print(\"Average Length of Text After Cleaning: \", train_data['length_cleaned'].mean())" + ], + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "QSUwLDjMb9R9", + "outputId": "826d34c1-a181-483d-bf02-f2da18ee4115" + }, + "execution_count": 52, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "Average Length of Text Before Cleaning: 600.4524292618142\n", + "Average Length of Text After Cleaning: 360.2900542295348\n" + ] + } + ] + }, + { + "cell_type": "code", + "source": [ + "plt.figure(figsize=(8, 7))\n", + "\n", + "sns.histplot(data=train_data, x='length', bins=20, kde=True, color='blue')\n", + "\n", + "plt.xlabel('Length', fontsize=14, fontweight='bold')\n", + "plt.ylabel('Frequency', fontsize=14, fontweight='bold')\n", + "plt.title('Distribution of Lengths', fontsize=16, fontweight='bold')\n", + "\n", + "plt.show()" + ], + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 648 + }, + "id": "MvHdc239b9cs", + "outputId": "910dac34-a6e2-43bd-dcfb-a23d8117d9f9" + }, + "execution_count": 53, + "outputs": [ + { + "output_type": "display_data", + "data": { + "text/plain": [ + "
" ], + "image/png": "\n" + }, + "metadata": {} + } + ] + }, + { + "cell_type": "markdown", + "source": [ + "Get Top bigrams" + ], + "metadata": { + "id": "Hd9C9jWEcsCv" + } + }, + { + "cell_type": "code", + "source": [ + "def get_top_n_bigram(corpus, n=None):\n", + "    \"\"\"Return the `n` most frequent bigrams in `corpus` as (bigram, count) pairs.\n", + "\n", + "    Pairs are sorted by descending count; with n=None every bigram is returned.\n", + "    \"\"\"\n", + "    vectorizer = CV(ngram_range=(2, 2))\n", + "    bag_of_words = vectorizer.fit_transform(corpus) # single pass: learn the vocabulary and count\n", + "    sum_words = bag_of_words.sum(axis=0) # corpus-wide total per vocabulary column\n", + "    words_freq = [(word, sum_words[0, idx]) for word, idx in vectorizer.vocabulary_.items()]\n", + "    words_freq.sort(key=lambda pair: pair[1], reverse=True)\n", + "    return words_freq[:n]" + ], + "metadata": { + "id": "ut_pswWab9mx" + }, + "execution_count": 54, + "outputs": [] + }, + { + "cell_type": "code", + "source": [ + "common_words = get_top_n_bigram(train_data['description_cleaned'], 10)\n", + "common_words_df = pd.DataFrame(common_words,columns=['word','freq'])\n", + "plt.figure(figsize=(10, 6))\n", + "ax = sns.barplot(x='freq', y='word', data=common_words_df,facecolor='yellow',linewidth=3,edgecolor=sns.color_palette(\"ch:start=3, rot=.1\",10))\n", + "\n", + "plt.title(\"Top 10 bigrams\",font='Serif')\n", + "plt.xlabel(\"Frequency\", fontsize=10)\n", + "plt.yticks(fontsize=13)\n", + "plt.xticks(rotation=45, fontsize=10)\n", + "plt.ylabel(\"\");" + ], + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 582 + }, + "id": "Uz810EoKb9wE", + "outputId": "a4d26ea4-975d-40fc-e1a7-dbe414b47f1e" + }, + "execution_count": 55, + "outputs": [ + { + "output_type": "display_data", + "data": { + "text/plain": [ + "
" ], + "image/png": "\n" + }, + "metadata": {} + } + ] + }, + { + "cell_type": "markdown", + "source": [ + "Label Encoding of the Target variable" + ], + "metadata": { + "id": "hwiJC56nczF3" + } + }, + { + "cell_type": "code", + "source": [ + "le = LabelEncoder()\n", + "train_data['genre'] = le.fit_transform(train_data['genre'].values)\n", + "\n", + "# keep only relevant columns\n", + "train_df = train_data.loc[:,['description_cleaned', 'genre']]\n", + "test_df = test_data.loc[:,['description_cleaned', 'title']]\n", + "train_df.head(10)" + ], + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 363 + }, + "id": "8woFS3nHcz8x", + "outputId": "a0670398-5d29-425e-cfd9-c6671a963590" + }, + "execution_count": 56, + "outputs": [ + { + "output_type": "execute_result", + "data": { + "text/plain": [ + " description_cleaned genre\n", + "1 listen convers doctor parent year old oscar le... 8\n", + "2 brother sister past incestu relationship curre... 24\n", + "3 bu empti student field trip museum natur histo... 1\n", + "4 help unemploy father make end meet edith twin ... 8\n", + "5 film titl refer recov bodi ground zero also st... 8\n", + "6 qualiti control consist seri singl take shot f... 7\n", + "7 tough econom time max joey run idea discov sen... 5\n", + "8 ron petri keanu reev troubl teen whose life ha... 6\n", + "9 sudden calamit event caus great loss life dama... 18\n", + "10 four high school student embark terrifi journe... 13" + ], + "text/html": [ + "\n", + "
\n", + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
description_cleanedgenre
1listen convers doctor parent year old oscar le...8
2brother sister past incestu relationship curre...24
3bu empti student field trip museum natur histo...1
4help unemploy father make end meet edith twin ...8
5film titl refer recov bodi ground zero also st...8
6qualiti control consist seri singl take shot f...7
7tough econom time max joey run idea discov sen...5
8ron petri keanu reev troubl teen whose life ha...6
9sudden calamit event caus great loss life dama...18
10four high school student embark terrifi journe...13
\n", + "
\n", + "
\n", + "\n", + "
\n", + " \n", + "\n", + " \n", + "\n", + " \n", + "
\n", + "\n", + "\n", + "
\n", + " \n", + "\n", + "\n", + "\n", + " \n", + "
\n", + "\n", + "
\n", + "
\n" + ] + }, + "metadata": {}, + "execution_count": 56 + } + ] + }, + { + "cell_type": "markdown", + "source": [ + "Train Test Split" + ], + "metadata": { + "id": "wf-EIfUjc44T" + } + }, + { + "cell_type": "code", + "source": [ + "train_set , val_set , train_label , val_label = train_test_split(train_df['description_cleaned'] , train_data['genre'] , test_size=0.2 , shuffle=True , random_state = 42)\n", + "\n", + "print(f'Split data into train and eval sets')\n", + "print(f'Trani Set\\t: {len(train_set)}\\nValidation Set\\t: {len(val_set)}')" + ], + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "M_GZ6Etlc08G", + "outputId": "826e59b0-a431-41d4-d5d0-76efd87fd1b0" + }, + "execution_count": 57, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "Split data into train and eval sets\n", + "Trani Set\t: 43371\n", + "Validation Set\t: 10843\n" + ] + } + ] + }, + { + "cell_type": "markdown", + "source": [ + "Feature Extraction" + ], + "metadata": { + "id": "Y3hByqDec9g8" + } + }, + { + "cell_type": "code", + "source": [ + "# using TF-IDF\n", + "vectorize = TfidfVectorizer(stop_words='english', max_features=100000)\n", + "train_set_tfidf = vectorize.fit_transform(train_set)\n", + "val_set_tfidf = vectorize.transform(val_set)" + ], + "metadata": { + "id": "ZjKOXXHWc0xA" + }, + "execution_count": 58, + "outputs": [] + }, + { + "cell_type": "code", + "source": [ + "# using BOW -- kept under its own names so it does not overwrite the TF-IDF vectorizer and matrices above\n", + "bow_vectorizer = CV()\n", + "train_set_bow = bow_vectorizer.fit_transform(train_set)\n", + "val_set_bow = bow_vectorizer.transform(val_set)" + ], + "metadata": { + "id": "vQ6KcnqPc0kv" + }, + "execution_count": 60, + "outputs": [] + }, + { + "cell_type": "markdown", + "source": [ + "[1] Logistic Regression Model (LR)" + ], + "metadata": { + "id": "UK_8NaUcdBxJ" + } + }, + { + "cell_type": "code", + "source": [ + "# max_iter raised from the default 100: lbfgs stopped at the iteration limit (ConvergenceWarning)\n", + "LR_model = LogisticRegression(max_iter=1000)\n", + "LR_model.fit(train_set_tfidf, train_label)\n", + "predict_LR = LR_model.predict(val_set_tfidf)\n", 
+ "print(classification_report(val_label, predict_LR))\n", + "LR_accuracy = accuracy_score(predict_LR,val_label)\n", + "print('Logistic Regression accuracy is: {:.2f}%'.format(LR_accuracy*100))" + ], + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "ACUMWJh6c0SM", + "outputId": "d73f33c8-5566-4948-ee54-78c1a1fe939d" + }, + "execution_count": 61, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + " precision recall f1-score support\n", + "\n", + " 0 0.39 0.30 0.34 263\n", + " 1 0.68 0.46 0.55 112\n", + " 2 0.31 0.23 0.26 139\n", + " 3 0.38 0.22 0.28 104\n", + " 4 0.00 0.00 0.00 61\n", + " 5 0.51 0.56 0.53 1443\n", + " 6 0.26 0.10 0.15 107\n", + " 7 0.69 0.76 0.73 2659\n", + " 8 0.55 0.67 0.61 2697\n", + " 9 0.28 0.19 0.23 150\n", + " 10 0.23 0.07 0.10 74\n", + " 11 0.80 0.50 0.62 40\n", + " 12 0.07 0.02 0.03 45\n", + " 13 0.63 0.61 0.62 431\n", + " 14 0.53 0.53 0.53 144\n", + " 15 0.22 0.08 0.12 50\n", + " 16 0.11 0.04 0.05 56\n", + " 17 0.18 0.06 0.09 34\n", + " 18 0.49 0.35 0.41 192\n", + " 19 0.20 0.09 0.12 151\n", + " 20 0.45 0.27 0.34 143\n", + " 21 0.38 0.38 0.38 1045\n", + " 22 0.61 0.37 0.46 93\n", + " 23 0.53 0.23 0.32 81\n", + " 24 0.29 0.23 0.26 309\n", + " 25 0.40 0.20 0.27 20\n", + " 26 0.88 0.78 0.82 200\n", + "\n", + " accuracy 0.55 10843\n", + " macro avg 0.41 0.31 0.34 10843\n", + "weighted avg 0.53 0.55 0.54 10843\n", + "\n", + "Logistic Regression accuracy is: 55.46%\n" + ] + }, + { + "output_type": "stream", + "name": "stderr", + "text": [ + "/usr/local/lib/python3.10/dist-packages/sklearn/linear_model/_logistic.py:458: ConvergenceWarning: lbfgs failed to converge (status=1):\n", + "STOP: TOTAL NO. 
of ITERATIONS REACHED LIMIT.\n", + "\n", + "Increase the number of iterations (max_iter) or scale the data as shown in:\n", + " https://scikit-learn.org/stable/modules/preprocessing.html\n", + "Please also refer to the documentation for alternative solver options:\n", + " https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression\n", + " n_iter_i = _check_optimize_result(\n" + ] + } + ] + }, + { + "cell_type": "markdown", + "source": [ + "[2] Multinomial Naive Bayes Model (MultinomialNB)" + ], + "metadata": { + "id": "xpibG5K_dQRx" + } + }, + { + "cell_type": "code", + "source": [ + "# Train a Naive Bayes classifier\n", + "NB_model = MultinomialNB()\n", + "NB_model.fit(train_set_tfidf, train_label)\n", + "y_pred_naive = NB_model.predict(val_set_tfidf)\n", + "print(classification_report(val_label, y_pred_naive))\n", + "naive_accuracy = accuracy_score(y_pred_naive,val_label)\n", + "print('Naive Bayes model accuracy is: {:.2f}%'.format(naive_accuracy*100))" + ], + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "HUk-bMmsdNbU", + "outputId": "d01bd6a3-1b51-4d30-d59d-e92a238b6159" + }, + "execution_count": 62, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + " precision recall f1-score support\n", + "\n", + " 0 0.72 0.08 0.14 263\n", + " 1 0.60 0.05 0.10 112\n", + " 2 0.33 0.06 0.10 139\n", + " 3 0.00 0.00 0.00 104\n", + " 4 0.00 0.00 0.00 61\n", + " 5 0.51 0.50 0.50 1443\n", + " 6 0.00 0.00 0.00 107\n", + " 7 0.57 0.88 0.69 2659\n", + " 8 0.47 0.80 0.59 2697\n", + " 9 0.50 0.01 0.01 150\n", + " 10 0.00 0.00 0.00 74\n", + " 11 1.00 0.07 0.14 40\n", + " 12 0.00 0.00 0.00 45\n", + " 13 0.74 0.36 0.49 431\n", + " 14 0.75 0.12 0.21 144\n", + " 15 0.00 0.00 0.00 50\n", + " 16 0.00 0.00 0.00 56\n", + " 17 0.00 0.00 0.00 34\n", + " 18 1.00 0.01 0.02 192\n", + " 19 0.00 0.00 0.00 151\n", + " 20 0.75 0.02 0.04 143\n", + " 21 0.57 0.14 0.23 1045\n", + " 22 0.75 0.10 0.17 93\n", + " 23 0.00 0.00 0.00 
81\n", + " 24 0.42 0.03 0.05 309\n", + " 25 0.00 0.00 0.00 20\n", + " 26 0.95 0.63 0.76 200\n", + "\n", + " accuracy 0.53 10843\n", + " macro avg 0.39 0.14 0.16 10843\n", + "weighted avg 0.52 0.53 0.45 10843\n", + "\n", + "Naive Bayes model accuracy is: 52.86%\n" + ] + }, + { + "output_type": "stream", + "name": "stderr", + "text": [ + "/usr/local/lib/python3.10/dist-packages/sklearn/metrics/_classification.py:1344: UndefinedMetricWarning: Precision and F-score are ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.\n", + " _warn_prf(average, modifier, msg_start, len(result))\n", + "/usr/local/lib/python3.10/dist-packages/sklearn/metrics/_classification.py:1344: UndefinedMetricWarning: Precision and F-score are ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.\n", + " _warn_prf(average, modifier, msg_start, len(result))\n", + "/usr/local/lib/python3.10/dist-packages/sklearn/metrics/_classification.py:1344: UndefinedMetricWarning: Precision and F-score are ill-defined and being set to 0.0 in labels with no predicted samples. 
Use `zero_division` parameter to control this behavior.\n", + " _warn_prf(average, modifier, msg_start, len(result))\n" + ] + } + ] + }, + { + "cell_type": "markdown", + "source": [ + "[3] Decision Tree Model (ID3)" + ], + "metadata": { + "id": "3MfpYtXbdWXc" + } + }, + { + "cell_type": "code", + "source": [ + "DT = DecisionTreeClassifier(max_depth=(1), random_state=0)\n", + "DT.fit(train_set_tfidf, train_label)\n", + "predict_ID3 = DT.predict(val_set_tfidf)\n", + "print(classification_report(val_label, predict_ID3))\n", + "ID3_accuracy = accuracy_score(predict_ID3,val_label)\n", + "print('ID3 model accuracy is: {:.2f}%'.format(ID3_accuracy*100))" + ], + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "k1uJXt8CdNQJ", + "outputId": "692e38cd-93f0-4ded-f5e6-a49546b33a3a" + }, + "execution_count": 63, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + " precision recall f1-score support\n", + "\n", + " 0 0.00 0.00 0.00 263\n", + " 1 0.00 0.00 0.00 112\n", + " 2 0.00 0.00 0.00 139\n", + " 3 0.00 0.00 0.00 104\n", + " 4 0.00 0.00 0.00 61\n", + " 5 0.00 0.00 0.00 1443\n", + " 6 0.00 0.00 0.00 107\n", + " 7 0.82 0.31 0.45 2659\n", + " 8 0.27 0.99 0.43 2697\n", + " 9 0.00 0.00 0.00 150\n", + " 10 0.00 0.00 0.00 74\n", + " 11 0.00 0.00 0.00 40\n", + " 12 0.00 0.00 0.00 45\n", + " 13 0.00 0.00 0.00 431\n", + " 14 0.00 0.00 0.00 144\n", + " 15 0.00 0.00 0.00 50\n", + " 16 0.00 0.00 0.00 56\n", + " 17 0.00 0.00 0.00 34\n", + " 18 0.00 0.00 0.00 192\n", + " 19 0.00 0.00 0.00 151\n", + " 20 0.00 0.00 0.00 143\n", + " 21 0.00 0.00 0.00 1045\n", + " 22 0.00 0.00 0.00 93\n", + " 23 0.00 0.00 0.00 81\n", + " 24 0.00 0.00 0.00 309\n", + " 25 0.00 0.00 0.00 20\n", + " 26 0.00 0.00 0.00 200\n", + "\n", + " accuracy 0.32 10843\n", + " macro avg 0.04 0.05 0.03 10843\n", + "weighted avg 0.27 0.32 0.22 10843\n", + "\n", + "ID3 model accuracy is: 32.30%\n" + ] + }, + { + "output_type": "stream", + "name": "stderr", + "text": [ + 
"/usr/local/lib/python3.10/dist-packages/sklearn/metrics/_classification.py:1344: UndefinedMetricWarning: Precision and F-score are ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.\n", + " _warn_prf(average, modifier, msg_start, len(result))\n", + "/usr/local/lib/python3.10/dist-packages/sklearn/metrics/_classification.py:1344: UndefinedMetricWarning: Precision and F-score are ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.\n", + " _warn_prf(average, modifier, msg_start, len(result))\n", + "/usr/local/lib/python3.10/dist-packages/sklearn/metrics/_classification.py:1344: UndefinedMetricWarning: Precision and F-score are ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.\n", + " _warn_prf(average, modifier, msg_start, len(result))\n" + ] + } + ] + }, + { + "cell_type": "markdown", + "source": [ + "[4] Support Vector Machine Model (SVC)" + ], + "metadata": { + "id": "yaDiam7_ddht" + } + }, + { + "cell_type": "code", + "source": [ + "# Train a SVC classifier\n", + "from sklearn.svm import LinearSVC\n", + "svm_model = LinearSVC()\n", + "svm_model.fit(train_set_tfidf, train_label)\n", + "predict = svm_model.predict(val_set_tfidf)\n", + "\n", + "print(classification_report(val_label, predict))\n", + "svm_accuracy = accuracy_score(predict,val_label)\n", + "print('SVC model accuracy is: {:.2f}%'.format(svm_accuracy*100))" + ], + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "WgtSuPc3dMdR", + "outputId": "1d5b85fe-4bc5-45d1-f939-f2f06a13eca9" + }, + "execution_count": 64, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + " precision recall f1-score support\n", + "\n", + " 0 0.36 0.31 0.33 263\n", + " 1 0.47 0.38 0.42 112\n", + " 2 0.22 0.19 0.20 139\n", + " 3 0.27 0.16 0.20 104\n", + " 
4 0.03 0.02 0.02 61\n", + " 5 0.48 0.51 0.49 1443\n", + " 6 0.13 0.09 0.11 107\n", + " 7 0.68 0.68 0.68 2659\n", + " 8 0.52 0.58 0.55 2697\n", + " 9 0.21 0.18 0.20 150\n", + " 10 0.19 0.11 0.14 74\n", + " 11 0.69 0.55 0.61 40\n", + " 12 0.06 0.02 0.03 45\n", + " 13 0.56 0.56 0.56 431\n", + " 14 0.51 0.54 0.52 144\n", + " 15 0.11 0.10 0.11 50\n", + " 16 0.10 0.07 0.08 56\n", + " 17 0.06 0.03 0.04 34\n", + " 18 0.37 0.30 0.33 192\n", + " 19 0.13 0.08 0.10 151\n", + " 20 0.38 0.28 0.32 143\n", + " 21 0.33 0.35 0.34 1045\n", + " 22 0.50 0.32 0.39 93\n", + " 23 0.30 0.22 0.26 81\n", + " 24 0.19 0.19 0.19 309\n", + " 25 0.12 0.10 0.11 20\n", + " 26 0.78 0.75 0.77 200\n", + "\n", + " accuracy 0.50 10843\n", + " macro avg 0.32 0.28 0.30 10843\n", + "weighted avg 0.49 0.50 0.49 10843\n", + "\n", + "SVC model accuracy is: 49.94%\n" + ] + }, + { + "output_type": "stream", + "name": "stderr", + "text": [ + "/usr/local/lib/python3.10/dist-packages/sklearn/svm/_base.py:1244: ConvergenceWarning: Liblinear failed to converge, increase the number of iterations.\n", + " warnings.warn(\n" + ] + } + ] + } + ] +} \ No newline at end of file diff --git a/Movie Genre Classification/README.MD b/Movie Genre Classification/README.MD new file mode 100644 index 00000000..522f46f1 --- /dev/null +++ b/Movie Genre Classification/README.MD @@ -0,0 +1,94 @@ +# Movie Genre Classification using Machine Learning + +## Project Overview + +Welcome to the Movie Genre Classification project! This project is part of the GirlScript Summer of Code program. The objective is to build a machine learning model to classify movies into genres based on various features such as plot summaries, cast, crew, and other metadata. 
+ +## Table of Contents + +- [Introduction](#introduction) +- [Installation](#installation) +- [Dataset](#dataset) +- [Data Preprocessing](#data-preprocessing) +- [Model Training](#model-training) +- [Model Evaluation](#model-evaluation) +- [Usage](#usage) +- [License](#license) + +## Introduction + +Classifying movies into genres can enhance the user experience by helping users find movies that match their interests more easily. Machine learning can automate this classification process by analyzing various features of the movies. + +In this project, we will: +1. Load and preprocess the movie dataset. +2. Train a machine learning model for genre classification. +3. Evaluate the performance of the model. +4. Provide usage instructions for making predictions on new data. + +## Installation + +To get started with this project, you'll need to have Python installed on your machine. You can install the necessary dependencies using pip: + +```bash +pip install -r requirements.txt +``` + +## Dataset + +The dataset used in this project consists of movie metadata, including plot summaries, cast, crew, and other relevant features. You can use publicly available datasets such as those from [IMDb](https://www.imdb.com/interfaces/) or [TMDb](https://www.themoviedb.org/documentation/api). + +## Data Preprocessing + +Data preprocessing steps include: +1. Loading the dataset. +2. Cleaning the data (handling missing values, removing duplicates, etc.). +3. Extracting relevant features (text processing for plot summaries, encoding categorical variables, etc.). +4. Splitting the dataset into training and testing sets. + +## Model Training + +We will use machine learning algorithms such as Logistic Regression, Random Forest, or a Neural Network for the classification task. The steps for training the model are as follows: +1. Choose the appropriate model. +2. Train the model on the training data. +3. Tune hyperparameters if necessary.
+ +## Model Evaluation + +To evaluate the performance of the model, we will use metrics such as: +- Accuracy +- Precision +- Recall +- F1-Score + +A confusion matrix and ROC-AUC curves will also be plotted for a comprehensive evaluation. + +## Usage + +To use the trained model for making predictions on new data, follow these steps: + +1. Load the pre-trained model from the saved file. +2. Prepare the input data in the same format as the training data. +3. Use the model to make predictions. + +Example code: + +```python +import pickle +import numpy as np + +# Load the pre-trained model +with open('movie_genre_model.pkl', 'rb') as file: + model = pickle.load(file) + +# Sample input data (replace with actual data) +input_data = np.array([[...]]) + +# Make predictions +predictions = model.predict(input_data) +print(predictions) +``` + + +## License + +This project is licensed under the MIT License. See the [LICENSE](LICENSE) file for details.