diff --git a/.ipynb_checkpoints/1_Data_Exploration-checkpoint.ipynb b/.ipynb_checkpoints/1_Data_Exploration-checkpoint.ipynb new file mode 100644 index 0000000..154d7e1 --- /dev/null +++ b/.ipynb_checkpoints/1_Data_Exploration-checkpoint.ipynb @@ -0,0 +1,746 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Plagiarism Text Data\n", + "\n", + "In this project, you will be tasked with building a plagiarism detector that examines a text file and performs binary classification; labeling that file as either plagiarized or not, depending on how similar the text file is when compared to a provided source text. \n", + "\n", + "The first step in working with any dataset is loading the data in and noting what information is included in the dataset. This is an important step in eventually working with this data, and knowing what kinds of features you have to work with as you transform and group the data!\n", + "\n", + "So, this notebook is all about exploring the data and noting patterns about the features you are given and the distribution of data. \n", + "\n", + "> There are not any exercises or questions in this notebook, it is only meant for exploration. This notebook will note be required in your final project submission.\n", + "\n", + "---" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Read in the Data\n", + "\n", + "The cell below will download the necessary data and extract the files into the folder `data/`.\n", + "\n", + "This data is a slightly modified version of a dataset created by Paul Clough (Information Studies) and Mark Stevenson (Computer Science), at the University of Sheffield. You can read all about the data collection and corpus, at [their university webpage](https://ir.shef.ac.uk/cloughie/resources/plagiarism_corpus.html). \n", + "\n", + "> **Citation for data**: Clough, P. and Stevenson, M. Developing A Corpus of Plagiarised Short Answers, Language Resources and Evaluation: Special Issue on Plagiarism and Authorship Analysis, In Press. [Download]" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "!wget https://s3.amazonaws.com/video.udacity-data.com/topher/2019/January/5c4147f9_data/data.zip\n", + "!unzip data" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [], + "source": [ + "# import libraries\n", + "import pandas as pd\n", + "import numpy as np\n", + "import os" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "This plagiarism dataset is made of multiple text files; each of these files has characteristics that are is summarized in a `.csv` file named `file_information.csv`, which we can read in using `pandas`." + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
FileTaskCategory
0g0pA_taska.txtanon
1g0pA_taskb.txtbcut
2g0pA_taskc.txtclight
3g0pA_taskd.txtdheavy
4g0pA_taske.txtenon
5g0pB_taska.txtanon
6g0pB_taskb.txtbnon
7g0pB_taskc.txtccut
8g0pB_taskd.txtdlight
9g0pB_taske.txteheavy
\n", + "
" + ], + "text/plain": [ + " File Task Category\n", + "0 g0pA_taska.txt a non\n", + "1 g0pA_taskb.txt b cut\n", + "2 g0pA_taskc.txt c light\n", + "3 g0pA_taskd.txt d heavy\n", + "4 g0pA_taske.txt e non\n", + "5 g0pB_taska.txt a non\n", + "6 g0pB_taskb.txt b non\n", + "7 g0pB_taskc.txt c cut\n", + "8 g0pB_taskd.txt d light\n", + "9 g0pB_taske.txt e heavy" + ] + }, + "execution_count": 2, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "csv_file = 'data/file_information.csv'\n", + "plagiarism_df = pd.read_csv(csv_file)\n", + "\n", + "# print out the first few rows of data info\n", + "plagiarism_df.head(10)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Types of Plagiarism\n", + "\n", + "Each text file is associated with one **Task** (task A-E) and one **Category** of plagiarism, which you can see in the above DataFrame.\n", + "\n", + "### Five task types, A-E\n", + "\n", + "Each text file contains an answer to one short question; these questions are labeled as tasks A-E.\n", + "* Each task, A-E, is about a topic that might be included in the Computer Science curriculum that was created by the authors of this dataset. \n", + " * For example, Task A asks the question: \"What is inheritance in object oriented programming?\"\n", + "\n", + "### Four categories of plagiarism \n", + "\n", + "Each text file has an associated plagiarism label/category:\n", + "\n", + "1. `cut`: An answer is plagiarized; it is copy-pasted directly from the relevant Wikipedia source text.\n", + "2. `light`: An answer is plagiarized; it is based on the Wikipedia source text and includes some copying and paraphrasing.\n", + "3. `heavy`: An answer is plagiarized; it is based on the Wikipedia source text but expressed using different words and structure. Since this doesn't copy directly from a source text, this will likely be the most challenging kind of plagiarism to detect.\n", + "4. `non`: An answer is not plagiarized; the Wikipedia source text is not used to create this answer.\n", + "5. `orig`: This is a specific category for the original, Wikipedia source text. We will use these files only for comparison purposes.\n", + "\n", + "> So, out of the submitted files, the only category that does not contain any plagiarism is `non`.\n", + "\n", + "In the next cell, print out some statistics about the data." + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Number of files: 100\n", + "Number of unique tasks/question types (A-E): 5\n", + "Unique plagiarism categories: ['non' 'cut' 'light' 'heavy' 'orig']\n" + ] + } + ], + "source": [ + "# print out some stats about the data\n", + "print('Number of files: ', plagiarism_df.shape[0]) # .shape[0] gives the rows \n", + "# .unique() gives unique items in a specified column\n", + "print('Number of unique tasks/question types (A-E): ', (len(plagiarism_df['Task'].unique())))\n", + "print('Unique plagiarism categories: ', (plagiarism_df['Category'].unique()))" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "You should see the number of text files in the dataset as well as some characteristics about the `Task` and `Category` columns. 
**Note that the file count of 100 *includes* the 5 _original_ wikipedia files for tasks A-E.** If you take a look at the files in the `data` directory, you'll notice that the original, source texts start with the filename `orig_` as opposed to `g` for \"group.\" \n", + "\n", + "> So, in total there are 100 files, 95 of which are answers (submitted by people) and 5 of which are the original, Wikipedia source texts.\n", + "\n", + "Your end goal will be to use this information to classify any given answer text into one of two categories, plagiarized or not-plagiarized." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Distribution of Data\n", + "\n", + "Next, let's look at the distribution of data. In this course, we've talked about traits like class imbalance that can inform how you develop an algorithm. So, here, we'll ask: **How evenly is our data distributed among different tasks and plagiarism levels?**\n", + "\n", + "Below, you should notice two things:\n", + "* Our dataset is quite small, especially with respect to examples of varying plagiarism levels.\n", + "* The data is distributed fairly evenly across task and plagiarism types." + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + "Task:\n" + ] + }, + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
TaskCounts
0a20
1b20
2c20
3d20
4e20
\n", + "
" + ], + "text/plain": [ + " Task Counts\n", + "0 a 20\n", + "1 b 20\n", + "2 c 20\n", + "3 d 20\n", + "4 e 20" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + "Plagiarism Levels:\n" + ] + }, + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
CategoryCounts
0cut19
1heavy19
2light19
3non38
4orig5
\n", + "
" + ], + "text/plain": [ + " Category Counts\n", + "0 cut 19\n", + "1 heavy 19\n", + "2 light 19\n", + "3 non 38\n", + "4 orig 5" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + "Task & Plagiarism Level Combos :\n" + ] + }, + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
TaskCategoryCounts
0acut4
1aheavy3
2alight3
3anon9
4aorig1
5bcut3
6bheavy4
7blight3
8bnon9
9borig1
10ccut3
11cheavy5
12clight4
13cnon7
14corig1
15dcut4
16dheavy4
17dlight5
18dnon6
19dorig1
20ecut5
21eheavy3
22elight4
23enon7
24eorig1
\n", + "
" + ], + "text/plain": [ + " Task Category Counts\n", + "0 a cut 4\n", + "1 a heavy 3\n", + "2 a light 3\n", + "3 a non 9\n", + "4 a orig 1\n", + "5 b cut 3\n", + "6 b heavy 4\n", + "7 b light 3\n", + "8 b non 9\n", + "9 b orig 1\n", + "10 c cut 3\n", + "11 c heavy 5\n", + "12 c light 4\n", + "13 c non 7\n", + "14 c orig 1\n", + "15 d cut 4\n", + "16 d heavy 4\n", + "17 d light 5\n", + "18 d non 6\n", + "19 d orig 1\n", + "20 e cut 5\n", + "21 e heavy 3\n", + "22 e light 4\n", + "23 e non 7\n", + "24 e orig 1" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "# Show counts by different tasks and amounts of plagiarism\n", + "\n", + "# group and count by task\n", + "counts_per_task=plagiarism_df.groupby(['Task']).size().reset_index(name=\"Counts\")\n", + "print(\"\\nTask:\")\n", + "display(counts_per_task)\n", + "\n", + "# group by plagiarism level\n", + "counts_per_category=plagiarism_df.groupby(['Category']).size().reset_index(name=\"Counts\")\n", + "print(\"\\nPlagiarism Levels:\")\n", + "display(counts_per_category)\n", + "\n", + "# group by task AND plagiarism level\n", + "counts_task_and_plagiarism=plagiarism_df.groupby(['Task', 'Category']).size().reset_index(name=\"Counts\")\n", + "print(\"\\nTask & Plagiarism Level Combos :\")\n", + "display(counts_task_and_plagiarism)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "It may also be helpful to look at this last DataFrame, graphically.\n", + "\n", + "Below, you can see that the counts follow a pattern broken down by task. Each task has one source text (original) and the highest number on `non` plagiarized cases." + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "" + ] + }, + "execution_count": 5, + "metadata": {}, + "output_type": "execute_result" + }, + { + "data": { + "image/png": 
"iVBORw0KGgoAAAANSUhEUgAAAd0AAAEyCAYAAAC/Lwo5AAAABHNCSVQICAgIfAhkiAAAAAlwSFlzAAALEgAACxIB0t1+/AAAADl0RVh0U29mdHdhcmUAbWF0cGxvdGxpYiB2ZXJzaW9uIDMuMC4zLCBodHRwOi8vbWF0cGxvdGxpYi5vcmcvnQurowAADCFJREFUeJzt3V+MpXddx/HP1w5EW4jWdEKwf1w0xoRwIWRiVAhpQI2iEU0MgQQD3qwXosWYKHoDNybGIMELQ7ICBmOFmFKVGKKQCFFvGnZLI21XlGD5UwtdQiLUm4r9ejGHuK67O2fa53xnz9nXK9nszJnnnPnOb57Je5/nnHm2ujsAwOZ9y0kPAADXC9EFgCGiCwBDRBcAhoguAAwRXQAYIroAMER0AWCI6ALAkL1NPOgtt9zSp06d2sRDA8A159y5c1/p7v2jtttIdE+dOpWzZ89u4qEB4JpTVZ9bZzunlwFgiOgCwBDRBYAhogsAQ0QXAIaILgAMEV0AGCK6ADBEdAFgiOgCwBDRBYAhG7n2Mv9f1fHv0738HNvOOgLbzJEuAAwRXQAYIroAMER0AWCI6ALAENEFgCGiCwBDRBcAhoguAAwRXQAYIroAMER0AWCI6ALAENEFgCGiCwBDRBcAhoguAAwRXQAYIroAMER0AWCI6ALAENEFgCGiCwBDRBcAhqwV3ar6tap6qKoerKr3V9W3bnowANg1R0a3qm5N8qtJDrr7RUluSPLaTQ8GALtm3dPLe0m+rar2ktyY5N83NxIA7KYjo9vdjyZ5e5LPJ3ksyX9090c2PRgA7Jp1Ti/fnOTVSV6Q5LuS3FRVr7/Mdqer6mxVnb1w4cLykwLAllvn9PKPJvm37r7Q3f+V5N4kP3LpRt19prsPuvtgf39/6TkBYOutE93PJ/mhqrqxqirJK5Oc3+xYALB71nlO974k9yS5P8mnVvc5s+G5AGDn7K2zUXe/NclbNzwLAOw0V6QCgCGiCwBDRBcAhoguAAwRXQAYIroAMER0AWCI6ALAENEFgCGiCwBDRBcAhoguAAwRXQAYIroAMER0AWCI6ALAENEFgCGiCwBDRBcAhoguAAwRXQAYIroAMER0AWDI3kkPAMyrOv59upefA663fdGRLgAMEV0AGCK6ADBEdAFgiOgCwBDRBYAhogsAQ0QXAIaILgAMEV0AGCK6ADBEdAFgiOgCwBDRBYAhogsAQ0QXAIaILgAMEV0AGCK6ADBEdAFgiOgCwBDRBYAhogsAQ0QXAIaILgAMWSu6VfUdVXVPVf1zVZ2vqh/e9GAAsGv21tzuD5L8TXf/fFU9O8mNG5wJAHbSkdGtqm9P8vIkb0yS7n4yyZObHQsAds86p5dfkORCkj+uqk9W1bur6qZLN6qq01V1tqrOXrhwYfFBAa4lVU/vD9e3daK7l+QlSd7V3S9O8p9J3nLpRt19prsPuvtgf39/4TEBYPutE90vJvlid9+3ev+eHEYYADiGI6Pb3V9K8oWq+v7VTa9M8vBGpwKAHbTuq5d/Jcndq1cufzbJL25uJADYTWtFt7sfSHKw4VkAYKe5IhUADBFdABgiugAwRHQBYIjoAsAQ0QWAIaILAENEFwCGiC4ADBFdABgiugAwRHQBYIjoAsAQ0QWAIaILAENEFwCGiC4ADBFdABgiugAwRHQBYIjoAsAQ0QWAIaILAEP2TnoAuN5UHf8+3cvPcb3bhe/DLnwN1xtHugAwRHQBYIjoAsAQ0QWAIaILAENEFwCGiC4ADBFdABgiugAwRHQBYIjoAsAQ0QWAIaILAENEFwCGiC4ADBFdABgiugAwRHQBYIjoAsAQ0QWAIaILAENEFwCGiC4ADBFdABiydnSr6oaq+mRV/fUmBwKAXXWcI927kpzf1CAAsOvWim5V3Zbkp5K8e7PjAMDuWvdI951JfiPJUxucBQB22t5RG1TVTyd5vLvPVdWdV9nudJLTSXLHHXcsNuDhYx//Pt2LjkB8H64Vu/B9WOJr2IV12AW+D8ezzpHuS5P8TFU9kuQDSV5RVX966Ubdfaa7D7r7YH9/f+ExAWD7HRnd7v6t7r6tu08leW2Sv+vu1298MgDYMX5PFwCGHPmc7sW6++NJPr6RSQBgxznSBYAhogsAQ0QXAIaILgAMEV0AGCK6ADBEdAFgiOgCwBDRBYAhogsAQ0QXAIaILgAMEV0AGCK6ADBEdAFgiOgCwBDRBYAhogsAQ0QXAIaILgAMEV0AGCK6ADBEdAFgiOgCwJC9kx5gQtXx79O9/GOctJP+Gp7O5196hiWc9DrCkuzPsxzpAsAQ0QWAIaILAENEFwCGiC4ADBFdABgiugAwRHQBYIjoAsAQ0QWAIaILAENEFwCGiC4ADBFdABgiugAwRHQBYIjoAsAQ0QWAIaILAENEFwCGiC4ADBFdABgiugAwRHQBYMiR0a2q26vqY1X1cFU9VFV3TQwGALtmb41tvpHk17v7/qp6bpJzVfXR7n54w7MBwE458ki3ux/r7vtXb389yfkkt256MADYNcd6TreqTiV5cZL7LvOx01V1tqrOXrhwYZnpAGCHrB3dqnpOkg8meXN3f+3Sj3f3me4+6O6D/f39JWcEgJ2wVnSr6lk5DO7d3X3vZkcCgN20zquXK8l7kpzv7ndsfiQA2E3rHOm+NMkvJHlFVT2w+vOqDc8FADvnyF8Z6u5/TFIDswDATnNFKgAYIroAMER0AWCI6ALAENEFgCGiCwBDRBcAhoguAAwRXQAYIroAMER0AWCI6ALAENEFgCGiCwBDRBcAhoguAAwRXQAYIroAMER0AWCI6ALAENEFgCGiCwBDRBcAhuyd9ACsp+r49+lefo5tZx2XYR2XYR2XsU3r6EgXAIaILgAMEV0AGCK6ADBEdAFgiOgCwBDRBYAhogsAQ0QXAIaILgAMEV0AGCK6ADBEdAFgiOgCwBDRBYAhogsAQ0QXAIaILgAMEV0AGCK6ADBEdAFgiOgCwBDRBYAhogsAQ9aKblX9RFV9uqo+U1Vv2fRQALCLjoxuVd2Q5A+T/GSSFyZ5XVW9cNODAcCuWedI9weTfKa7P9vdTyb5QJJXb3YsANg960T31iRfuOj9L65uAwCOYW+pB6qq00lOr959oqo+vdRjX8UtSb5y+Xme2QM/0/tfCzMc4/7WcZn7X7PruGXfh8uu45Z9DdfCDDu7jsNfwxV/ri/x3es82DrRfTTJ7Re9f9vqtv+ju88kObPOJ11KVZ3t7oPJz7mLrOMyrOMyrOMyrOMyll7HdU4vfyLJ91XVC6rq2Ulem+RDSw0AANeLI490u/sbVfWmJH+b5IYk7+3uhzY+GQDsmLWe0+3uDyf58IZneTpGT2fvMOu4DOu4DOu4DOu4jEXXsbp7yccDAK7AZSABYIjoAsCQrY2u60Evo6oeqapPVdUDVXX2pOfZFlX13qp6vKoevOi276yqj1bVv67+vvkkZ9wGV1jHt1XVo6t98oGqetVJznitq6rbq+pjVfVwVT1UVXetbrc/HsNV1nHR/XErn9NdXQ/6X5
L8WA6vkPWJJK/r7odPdLAtVFWPJDno7nV++ZuVqnp5kieS/El3v2h12+8l+Wp3/+7qH4I3d/dvnuSc17orrOPbkjzR3W8/ydm2RVU9P8nzu/v+qnpuknNJfjbJG2N/XNtV1vE1WXB/3NYjXdeD5kR1998n+eolN786yftWb78vhz+wXMUV1pFj6O7Huvv+1dtfT3I+h5fqtT8ew1XWcVHbGl3Xg15OJ/lIVZ1bXcqTp+953f3Y6u0vJXneSQ6z5d5UVf+0Ov3stOiaqupUkhcnuS/2x6ftknVMFtwftzW6LOdl3f2SHP7Xjb+8Ot3HM9SHz9ts33M314Z3JfneJD+Q5LEkv3+y42yHqnpOkg8meXN3f+3ij9kf13eZdVx0f9zW6K51PWiO1t2Prv5+PMlf5PDUPU/Pl1fPC33z+aHHT3ierdTdX+7u/+7up5L8UeyTR6qqZ+UwFHd3972rm+2Px3S5dVx6f9zW6Loe9AKq6qbVCwZSVTcl+fEkD179XlzFh5K8YfX2G5L81QnOsrW+GYqVn4t98qqqqpK8J8n57n7HRR+yPx7DldZx6f1xK1+9nCSrl22/M/97PejfOeGRtk5VfU8Oj26Tw0uC/pl1XE9VvT/JnTn8b7++nOStSf4yyZ8nuSPJ55K8pru9SOgqrrCOd+bwVF4neSTJL1303CSXqKqXJfmHJJ9K8tTq5t/O4fOR9sc1XWUdX5cF98etjS4AbJttPb0MAFtHdAFgiOgCwBDRBYAhogsAQ0QXAIaILgAM+R8ehKbWpEhdRgAAAABJRU5ErkJggg==\n", + "text/plain": [ + "
" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "import matplotlib.pyplot as plt\n", + "% matplotlib inline\n", + "\n", + "# counts\n", + "group = ['Task', 'Category']\n", + "counts = plagiarism_df.groupby(group).size().reset_index(name=\"Counts\")\n", + "\n", + "plt.figure(figsize=(8,5))\n", + "plt.bar(range(len(counts)), counts['Counts'], color = 'blue')" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Up Next\n", + "\n", + "This notebook is just about data loading and exploration, and you do not need to include it in your final project submission. \n", + "\n", + "In the next few notebooks, you'll use this data to train a complete plagiarism classifier. You'll be tasked with extracting meaningful features from the text data, reading in answers to different tasks and comparing them to the original Wikipedia source text. You'll engineer similarity features that will help identify cases of plagiarism. Then, you'll use these features to train and deploy a classification model in a SageMaker notebook instance. " + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "conda_amazonei_mxnet_p36", + "language": "python", + "name": "conda_amazonei_mxnet_p36" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.6.5" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/.ipynb_checkpoints/2_Plagiarism_Feature_Engineering-checkpoint.ipynb b/.ipynb_checkpoints/2_Plagiarism_Feature_Engineering-checkpoint.ipynb new file mode 100644 index 0000000..9a32e37 --- /dev/null +++ b/.ipynb_checkpoints/2_Plagiarism_Feature_Engineering-checkpoint.ipynb @@ -0,0 +1,2362 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Plagiarism Detection, Feature Engineering\n", + "\n", + "In this project, you will be tasked with building a plagiarism detector that examines an answer text file and performs binary classification; labeling that file as either plagiarized or not, depending on how similar that text file is to a provided, source text. \n", + "\n", + "Your first task will be to create some features that can then be used to train a classification model. 
This task will be broken down into a few discrete steps:\n", + "\n", + "* Clean and pre-process the data.\n", + "* Define features for comparing the similarity of an answer text and a source text, and extract similarity features.\n", + "* Select \"good\" features, by analyzing the correlations between different features.\n", + "* Create train/test `.csv` files that hold the relevant features and class labels for train/test data points.\n", + "\n", + "In the _next_ notebook, Notebook 3, you'll use the features and `.csv` files you create in _this_ notebook to train a binary classification model in a SageMaker notebook instance.\n", + "\n", + "You'll be defining a few different similarity features, as outlined in [this paper](https://s3.amazonaws.com/video.udacity-data.com/topher/2019/January/5c412841_developing-a-corpus-of-plagiarised-short-answers/developing-a-corpus-of-plagiarised-short-answers.pdf), which should help you build a robust plagiarism detector!\n", + "\n", + "To complete this notebook, you'll have to complete all given exercises and answer all the questions in this notebook.\n", + "> All your tasks will be clearly labeled **EXERCISE** and questions as **QUESTION**.\n", + "\n", + "It will be up to you to decide on the features to include in your final training and test data.\n", + "\n", + "---" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Read in the Data\n", + "\n", + "The cell below will download the necessary, project data and extract the files into the folder `data/`.\n", + "\n", + "This data is a slightly modified version of a dataset created by Paul Clough (Information Studies) and Mark Stevenson (Computer Science), at the University of Sheffield. You can read all about the data collection and corpus, at [their university webpage](https://ir.shef.ac.uk/cloughie/resources/plagiarism_corpus.html). \n", + "\n", + "> **Citation for data**: Clough, P. and Stevenson, M. Developing A Corpus of Plagiarised Short Answers, Language Resources and Evaluation: Special Issue on Plagiarism and Authorship Analysis, In Press. [Download]" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [], + "source": [ + "# NOTE:\n", + "# you only need to run this cell if you have not yet downloaded the data\n", + "# otherwise you may skip this cell or comment it out\n", + "\n", + "#!wget https://s3.amazonaws.com/video.udacity-data.com/topher/2019/January/5c4147f9_data/data.zip\n", + "#!unzip data" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [], + "source": [ + "# import libraries\n", + "import pandas as pd\n", + "import numpy as np\n", + "import os" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "This plagiarism dataset is made of multiple text files; each of these files has characteristics that are is summarized in a `.csv` file named `file_information.csv`, which we can read in using `pandas`." + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
FileTaskCategory
0g0pA_taska.txtanon
1g0pA_taskb.txtbcut
2g0pA_taskc.txtclight
3g0pA_taskd.txtdheavy
4g0pA_taske.txtenon
\n", + "
" + ], + "text/plain": [ + " File Task Category\n", + "0 g0pA_taska.txt a non\n", + "1 g0pA_taskb.txt b cut\n", + "2 g0pA_taskc.txt c light\n", + "3 g0pA_taskd.txt d heavy\n", + "4 g0pA_taske.txt e non" + ] + }, + "execution_count": 3, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "csv_file = 'data/file_information.csv'\n", + "plagiarism_df = pd.read_csv(csv_file)\n", + "\n", + "# print out the first few rows of data info\n", + "plagiarism_df.head()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Types of Plagiarism\n", + "\n", + "Each text file is associated with one **Task** (task A-E) and one **Category** of plagiarism, which you can see in the above DataFrame.\n", + "\n", + "### Tasks, A-E\n", + "\n", + "Each text file contains an answer to one short question; these questions are labeled as tasks A-E. For example, Task A asks the question: \"What is inheritance in object oriented programming?\"\n", + "\n", + "### Categories of plagiarism \n", + "\n", + "Each text file has an associated plagiarism label/category:\n", + "\n", + "**1. Plagiarized categories: `cut`, `light`, and `heavy`.**\n", + "* These categories represent different levels of plagiarized answer texts. `cut` answers copy directly from a source text, `light` answers are based on the source text but include some light rephrasing, and `heavy` answers are based on the source text, but *heavily* rephrased (and will likely be the most challenging kind of plagiarism to detect).\n", + " \n", + "**2. Non-plagiarized category: `non`.** \n", + "* `non` indicates that an answer is not plagiarized; the Wikipedia source text is not used to create this answer.\n", + " \n", + "**3. Special, source text category: `orig`.**\n", + "* This is a specific category for the original, Wikipedia source text. We will use these files only for comparison purposes." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "---\n", + "## Pre-Process the Data\n", + "\n", + "In the next few cells, you'll be tasked with creating a new DataFrame of desired information about all of the files in the `data/` directory. This will prepare the data for feature extraction and for training a binary, plagiarism classifier." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### EXERCISE: Convert categorical to numerical data\n", + "\n", + "You'll notice that the `Category` column in the data, contains string or categorical values, and to prepare these for feature extraction, we'll want to convert these into numerical values. Additionally, our goal is to create a binary classifier and so we'll need a binary class label that indicates whether an answer text is plagiarized (1) or not (0). Complete the below function `numerical_dataframe` that reads in a `file_information.csv` file by name, and returns a *new* DataFrame with a numerical `Category` column and a new `Class` column that labels each answer as plagiarized or not. \n", + "\n", + "Your function should return a new DataFrame with the following properties:\n", + "\n", + "* 4 columns: `File`, `Task`, `Category`, `Class`. 
The `File` and `Task` columns can remain unchanged from the original `.csv` file.\n", + "* Convert all `Category` labels to numerical labels according to the following rules (a higher value indicates a higher degree of plagiarism):\n", + " * 0 = `non`\n", + " * 1 = `heavy`\n", + " * 2 = `light`\n", + " * 3 = `cut`\n", + " * -1 = `orig`, this is a special value that indicates an original file.\n", + "* For the new `Class` column\n", + " * Any answer text that is not plagiarized (`non`) should have the class label `0`. \n", + " * Any plagiarized answer texts should have the class label `1`. \n", + " * And any `orig` texts will have a special label `-1`. \n", + "\n", + "### Expected output\n", + "\n", + "After running your function, you should get a DataFrame with rows that looks like the following: \n", + "```\n", + "\n", + " File\t Task Category Class\n", + "0\tg0pA_taska.txt\ta\t 0 \t0\n", + "1\tg0pA_taskb.txt\tb\t 3 \t1\n", + "2\tg0pA_taskc.txt\tc\t 2 \t1\n", + "3\tg0pA_taskd.txt\td\t 1 \t1\n", + "4\tg0pA_taske.txt\te\t 0\t 0\n", + "...\n", + "...\n", + "99 orig_taske.txt e -1 -1\n", + "\n", + "```" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [], + "source": [ + "# Read in a csv file and return a transformed dataframe\n", + "def numerical_dataframe(csv_file='data/file_information.csv'):\n", + " '''Reads in a csv file which is assumed to have `File`, `Category` and `Task` columns.\n", + " This function does two things: \n", + " 1) converts `Category` column values to numerical values \n", + " 2) Adds a new, numerical `Class` label column.\n", + " The `Class` column will label plagiarized answers as 1 and non-plagiarized as 0.\n", + " Source texts have a special label, -1.\n", + " :param csv_file: The directory for the file_information.csv file\n", + " :return: A dataframe with numerical categories and a new `Class` label column'''\n", + " \n", + " # your code here\n", + " category_to_numerical = {'non': 0, 'heavy': 1, 'light': 2, 'cut': 3, 'orig': -1 }\n", + " numerical_to_class = {'non': 0, 'heavy': 1, 'light': 1, 'cut': 1, 'orig': -1}\n", + " df = pd.read_csv(csv_file)\n", + " class_list = []\n", + " category_list = []\n", + " for index, row in df.iterrows():\n", + " category_list.append(category_to_numerical[row['Category']])\n", + " class_list.append(numerical_to_class[row['Category']])\n", + " df['Category'] = category_list\n", + " df['Class'] = class_list\n", + " return df\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Test cells\n", + "\n", + "Below are a couple of test cells. The first is an informal test where you can check that your code is working as expected by calling your function and printing out the returned result.\n", + "\n", + "The **second** cell below is a more rigorous test cell. The goal of a cell like this is to ensure that your code is working as expected, and to form any variables that might be used in _later_ tests/code, in this case, the data frame, `transformed_df`.\n", + "\n", + "> The cells in this notebook should be run in chronological order (the order they appear in the notebook). This is especially important for test cells.\n", + "\n", + "Often, later cells rely on the functions, imports, or variables defined in earlier cells. For example, some tests rely on previous tests to work.\n", + "\n", + "These tests do not test all cases, but they are a great way to check that you are on the right track!" 
+ ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
FileTaskCategoryClass
0g0pA_taska.txta00
1g0pA_taskb.txtb31
2g0pA_taskc.txtc21
3g0pA_taskd.txtd11
4g0pA_taske.txte00
5g0pB_taska.txta00
6g0pB_taskb.txtb00
7g0pB_taskc.txtc31
8g0pB_taskd.txtd21
9g0pB_taske.txte11
\n", + "
" + ], + "text/plain": [ + " File Task Category Class\n", + "0 g0pA_taska.txt a 0 0\n", + "1 g0pA_taskb.txt b 3 1\n", + "2 g0pA_taskc.txt c 2 1\n", + "3 g0pA_taskd.txt d 1 1\n", + "4 g0pA_taske.txt e 0 0\n", + "5 g0pB_taska.txt a 0 0\n", + "6 g0pB_taskb.txt b 0 0\n", + "7 g0pB_taskc.txt c 3 1\n", + "8 g0pB_taskd.txt d 2 1\n", + "9 g0pB_taske.txt e 1 1" + ] + }, + "execution_count": 5, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# informal testing, print out the results of a called function\n", + "# create new `transformed_df`\n", + "transformed_df = numerical_dataframe(csv_file ='data/file_information.csv')\n", + "\n", + "# check work\n", + "# check that all categories of plagiarism have a class label = 1\n", + "transformed_df.head(10)" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Tests Passed!\n", + "\n", + "Example data: \n" + ] + }, + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
FileTaskCategoryClass
0g0pA_taska.txta00
1g0pA_taskb.txtb31
2g0pA_taskc.txtc21
3g0pA_taskd.txtd11
4g0pA_taske.txte00
\n", + "
" + ], + "text/plain": [ + " File Task Category Class\n", + "0 g0pA_taska.txt a 0 0\n", + "1 g0pA_taskb.txt b 3 1\n", + "2 g0pA_taskc.txt c 2 1\n", + "3 g0pA_taskd.txt d 1 1\n", + "4 g0pA_taske.txt e 0 0" + ] + }, + "execution_count": 6, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# test cell that creates `transformed_df`, if tests are passed\n", + "\n", + "\"\"\"\n", + "DON'T MODIFY ANYTHING IN THIS CELL THAT IS BELOW THIS LINE\n", + "\"\"\"\n", + "\n", + "# importing tests\n", + "import problem_unittests as tests\n", + "\n", + "# test numerical_dataframe function\n", + "tests.test_numerical_df(numerical_dataframe)\n", + "\n", + "# if above test is passed, create NEW `transformed_df`\n", + "transformed_df = numerical_dataframe(csv_file ='data/file_information.csv')\n", + "\n", + "# check work\n", + "print('\\nExample data: ')\n", + "transformed_df.head()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Text Processing & Splitting Data\n", + "\n", + "Recall that the goal of this project is to build a plagiarism classifier. At it's heart, this task is a comparison text; one that looks at a given answer and a source text, compares them and predicts whether an answer has plagiarized from the source. To effectively do this comparison, and train a classifier we'll need to do a few more things: pre-process all of our text data and prepare the text files (in this case, the 95 answer files and 5 original source files) to be easily compared, and split our data into a `train` and `test` set that can be used to train a classifier and evaluate it, respectively. \n", + "\n", + "To this end, you've been provided code that adds additional information to your `transformed_df` from above. The next two cells need not be changed; they add two additional columns to the `transformed_df`:\n", + "\n", + "1. A `Text` column; this holds all the lowercase text for a `File`, with extraneous punctuation removed.\n", + "2. A `Datatype` column; this is a string value `train`, `test`, or `orig` that labels a data point as part of our train or test set\n", + "\n", + "The details of how these additional columns are created can be found in the `helpers.py` file in the project directory. You're encouraged to read through that file to see exactly how text is processed and how data is split.\n", + "\n", + "Run the cells below to get a `complete_df` that has all the information you need to proceed with plagiarism detection and feature engineering." + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
FileTaskCategoryClassText
0g0pA_taska.txta00inheritance is a basic concept of object orien...
1g0pA_taskb.txtb31pagerank is a link analysis algorithm used by ...
2g0pA_taskc.txtc21the vector space model also called term vector...
3g0pA_taskd.txtd11bayes theorem was names after rev thomas bayes...
4g0pA_taske.txte00dynamic programming is an algorithm design tec...
\n", + "
" + ], + "text/plain": [ + " File Task Category Class \\\n", + "0 g0pA_taska.txt a 0 0 \n", + "1 g0pA_taskb.txt b 3 1 \n", + "2 g0pA_taskc.txt c 2 1 \n", + "3 g0pA_taskd.txt d 1 1 \n", + "4 g0pA_taske.txt e 0 0 \n", + "\n", + " Text \n", + "0 inheritance is a basic concept of object orien... \n", + "1 pagerank is a link analysis algorithm used by ... \n", + "2 the vector space model also called term vector... \n", + "3 bayes theorem was names after rev thomas bayes... \n", + "4 dynamic programming is an algorithm design tec... " + ] + }, + "execution_count": 7, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "\"\"\"\n", + "DON'T MODIFY ANYTHING IN THIS CELL THAT IS BELOW THIS LINE\n", + "\"\"\"\n", + "import helpers \n", + "\n", + "# create a text column \n", + "text_df = helpers.create_text_column(transformed_df)\n", + "text_df.head()" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Sample processed text:\n", + "\n", + " inheritance is a basic concept of object oriented programming where the basic idea is to create new classes that add extra detail to existing classes this is done by allowing the new classes to reuse the methods and variables of the existing classes and new methods and classes are added to specialise the new class inheritance models the is kind of relationship between entities or objects for example postgraduates and undergraduates are both kinds of student this kind of relationship can be visualised as a tree structure where student would be the more general root node and both postgraduate and undergraduate would be more specialised extensions of the student node or the child nodes in this relationship student would be known as the superclass or parent class whereas postgraduate would be known as the subclass or child class because the postgraduate class extends the student class inheritance can occur on several layers where if visualised would display a larger tree structure for example we could further extend the postgraduate node by adding two extra extended classes to it called msc student and phd student as both these types of student are kinds of postgraduate student this would mean that both the msc student and phd student classes would inherit methods and variables from both the postgraduate and student classes \n" + ] + } + ], + "source": [ + "# after running the cell above\n", + "# check out the processed text for a single file, by row index\n", + "row_idx = 0 # feel free to change this index\n", + "\n", + "sample_text = text_df.iloc[0]['Text']\n", + "\n", + "print('Sample processed text:\\n\\n', sample_text)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Split data into training and test sets\n", + "\n", + "The next cell will add a `Datatype` column to a given DataFrame to indicate if the record is: \n", + "* `train` - Training data, for model training.\n", + "* `test` - Testing data, for model evaluation.\n", + "* `orig` - The task's original answer from wikipedia.\n", + "\n", + "### Stratified sampling\n", + "\n", + "The given code uses a helper function which you can view in the `helpers.py` file in the main project directory. This implements [stratified random sampling](https://en.wikipedia.org/wiki/Stratified_sampling) to randomly split data by task & plagiarism amount. 
Stratified sampling ensures that we get training and test data that is fairly evenly distributed across task & plagiarism combinations. Approximately 26% of the data is held out for testing and 74% of the data is used for training.\n", + "\n", + "The function **train_test_dataframe** takes in a DataFrame that it assumes has `Task` and `Category` columns, and, returns a modified frame that indicates which `Datatype` (train, test, or orig) a file falls into. This sampling will change slightly based on a passed in *random_seed*. Due to a small sample size, this stratified random sampling will provide more stable results for a binary plagiarism classifier. Stability here is smaller *variance* in the accuracy of classifier, given a random seed." + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
FileTaskCategoryClassTextDatatype
0g0pA_taska.txta00inheritance is a basic concept of object orien...train
1g0pA_taskb.txtb31pagerank is a link analysis algorithm used by ...test
2g0pA_taskc.txtc21the vector space model also called term vector...train
3g0pA_taskd.txtd11bayes theorem was names after rev thomas bayes...train
4g0pA_taske.txte00dynamic programming is an algorithm design tec...train
5g0pB_taska.txta00inheritance is a basic concept in object orien...train
6g0pB_taskb.txtb00pagerank pr refers to both the concept and the...train
7g0pB_taskc.txtc31vector space model is an algebraic model for r...test
8g0pB_taskd.txtd21bayes theorem relates the conditional and marg...train
9g0pB_taske.txte11dynamic programming is a method for solving ma...test
\n", + "
" + ], + "text/plain": [ + " File Task Category Class \\\n", + "0 g0pA_taska.txt a 0 0 \n", + "1 g0pA_taskb.txt b 3 1 \n", + "2 g0pA_taskc.txt c 2 1 \n", + "3 g0pA_taskd.txt d 1 1 \n", + "4 g0pA_taske.txt e 0 0 \n", + "5 g0pB_taska.txt a 0 0 \n", + "6 g0pB_taskb.txt b 0 0 \n", + "7 g0pB_taskc.txt c 3 1 \n", + "8 g0pB_taskd.txt d 2 1 \n", + "9 g0pB_taske.txt e 1 1 \n", + "\n", + " Text Datatype \n", + "0 inheritance is a basic concept of object orien... train \n", + "1 pagerank is a link analysis algorithm used by ... test \n", + "2 the vector space model also called term vector... train \n", + "3 bayes theorem was names after rev thomas bayes... train \n", + "4 dynamic programming is an algorithm design tec... train \n", + "5 inheritance is a basic concept in object orien... train \n", + "6 pagerank pr refers to both the concept and the... train \n", + "7 vector space model is an algebraic model for r... test \n", + "8 bayes theorem relates the conditional and marg... train \n", + "9 dynamic programming is a method for solving ma... test " + ] + }, + "execution_count": 9, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "random_seed = 1 # can change; set for reproducibility\n", + "\n", + "\"\"\"\n", + "DON'T MODIFY ANYTHING IN THIS CELL THAT IS BELOW THIS LINE\n", + "\"\"\"\n", + "import helpers\n", + "\n", + "# create new df with Datatype (train, test, orig) column\n", + "# pass in `text_df` from above to create a complete dataframe, with all the information you need\n", + "complete_df = helpers.train_test_dataframe(text_df, random_seed=random_seed)\n", + "\n", + "# check results\n", + "complete_df.head(10)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Determining Plagiarism\n", + "\n", + "Now that you've prepared this data and created a `complete_df` of information, including the text and class associated with each file, you can move on to the task of extracting similarity features that will be useful for plagiarism classification. \n", + "\n", + "> Note: The following code exercises, assume that the `complete_df` as it exists now, will **not** have its existing columns modified. \n", + "\n", + "The `complete_df` should always include the columns: `['File', 'Task', 'Category', 'Class', 'Text', 'Datatype']`. You can add additional columns, and you can create any new DataFrames you need by copying the parts of the `complete_df` as long as you do not modify the existing values, directly.\n", + "\n", + "---" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "\n", + "# Similarity Features \n", + "\n", + "One of the ways we might go about detecting plagiarism, is by computing **similarity features** that measure how similar a given answer text is as compared to the original wikipedia source text (for a specific task, a-e). The similarity features you will use are informed by [this paper on plagiarism detection](https://s3.amazonaws.com/video.udacity-data.com/topher/2019/January/5c412841_developing-a-corpus-of-plagiarised-short-answers/developing-a-corpus-of-plagiarised-short-answers.pdf). \n", + "> In this paper, researchers created features called **containment** and **longest common subsequence**. \n", + "\n", + "Using these features as input, you will train a model to distinguish between plagiarized and not-plagiarized text files.\n", + "\n", + "## Feature Engineering\n", + "\n", + "Let's talk a bit more about the features we want to include in a plagiarism detection model and how to calculate such features. 
In the following explanations, I'll refer to a submitted text file as a **Student Answer Text (A)** and the original Wikipedia source file (that we want to compare that answer to) as the **Wikipedia Source Text (S)**.\n", + "\n", + "### Containment\n", + "\n", + "Your first task will be to create **containment features**. To understand containment, let's first revisit a definition of [n-grams](https://en.wikipedia.org/wiki/N-gram). An *n-gram* is a sequential word grouping. For example, in a line like \"bayes rule gives us a way to combine prior knowledge with new information,\" a 1-gram is just one word, like \"bayes.\" A 2-gram might be \"bayes rule\" and a 3-gram might be \"combine prior knowledge.\"\n", + "\n", + "> Containment is defined as the **intersection** of the n-gram word count of the Wikipedia Source Text (S) with the n-gram word count of the Student Answer Text (A), *divided* by the n-gram word count of the Student Answer Text.\n", + "\n", + "$$ \\frac{\\sum{count(\\text{ngram}_{A}) \\cap count(\\text{ngram}_{S})}}{\\sum{count(\\text{ngram}_{A})}} $$\n", + "\n", + "If the two texts have no n-grams in common, the containment will be 0, but if _all_ their n-grams intersect then the containment will be 1. Intuitively, you can see how having longer n-grams in common might be an indication of cut-and-paste plagiarism. In this project, it will be up to you to decide on the appropriate `n` or several `n`'s to use in your final model.\n", + "\n", + "### EXERCISE: Create containment features\n", + "\n", + "Given the `complete_df` that you've created, you should have all the information you need to compare any Student Answer Text (A) with its appropriate Wikipedia Source Text (S). An answer for task A should be compared to the source text for task A, just as answers to tasks B, C, D, and E should be compared to the corresponding original source text.\n", + "\n", + "In this exercise, you'll complete the function `calculate_containment`, which calculates containment based upon the following parameters:\n", + "* A given DataFrame, `df` (which is assumed to be the `complete_df` from above)\n", + "* An `answer_filename`, such as 'g0pB_taskd.txt' \n", + "* An n-gram length, `n`\n", + "\n", + "### Containment calculation\n", + "\n", + "The general steps to complete this function are as follows:\n", + "1. From *all* of the text files in a given `df`, create an array of n-gram counts; it is suggested that you use a [CountVectorizer](https://scikit-learn.org/stable/modules/generated/sklearn.feature_extraction.text.CountVectorizer.html) for this purpose.\n", + "2. Get the processed answer and source texts for the given `answer_filename`.\n", + "3. Calculate the containment between an answer and source text according to the following equation.\n", + "\n", + " >$$ \\frac{\\sum{count(\\text{ngram}_{A}) \\cap count(\\text{ngram}_{S})}}{\\sum{count(\\text{ngram}_{A})}} $$\n", + " \n", + "4. Return that containment value.\n", + "\n", + "You are encouraged to write any helper functions that you need to complete the function below.",
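+ "\n",
+ "Before writing the full function, it can help to see the containment arithmetic on a tiny, made-up pair of texts. The sketch below is only an illustration (the toy strings, the choice of `n = 1`, and the variable names are assumptions, not part of the project data); it uses scikit-learn's `CountVectorizer` to count n-grams and takes the element-wise minimum of the two count vectors as the intersection:\n",
+ "\n",
+ "```python\n",
+ "# Minimal containment sketch on two toy strings (not the graded solution).\n",
+ "import numpy as np\n",
+ "from sklearn.feature_extraction.text import CountVectorizer\n",
+ "\n",
+ "toy_answer = 'inheritance lets a new class reuse the methods of an existing class'\n",
+ "toy_source = 'inheritance allows a new class to reuse methods and variables of an existing class'\n",
+ "\n",
+ "# count 1-grams for both texts, using a shared vocabulary\n",
+ "counts = CountVectorizer(analyzer='word', ngram_range=(1, 1))\n",
+ "ngrams = counts.fit_transform([toy_answer, toy_source]).toarray()\n",
+ "\n",
+ "# intersection = element-wise minimum of the answer and source count vectors\n",
+ "intersection = np.minimum(ngrams[0], ngrams[1]).sum()\n",
+ "\n",
+ "# normalize by the total n-gram count of the answer text\n",
+ "containment = intersection / ngrams[0].sum()\n",
+ "print(containment)  # close to 1 for heavy overlap, close to 0 for none\n",
+ "```"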
+ ] + }, + { + "cell_type": "code", + "execution_count": 10, + "metadata": {}, + "outputs": [], + "source": [ + "from sklearn.feature_extraction.text import CountVectorizer\n", + "\n", + "# Calculate the ngram containment for one answer file/source file pair in a df\n", + "def calculate_containment(df, n, answer_filename):\n", + " '''Calculates the containment between a given answer text and its associated source text.\n", + " This function creates a count of ngrams (of a size, n) for each text file in our data.\n", + " Then calculates the containment by finding the ngram count for a given answer text, \n", + " and its associated source text, and calculating the normalized intersection of those counts.\n", + " :param df: A dataframe with columns,\n", + " 'File', 'Task', 'Category', 'Class', 'Text', and 'Datatype'\n", + " :param n: An integer that defines the ngram size\n", + " :param answer_filename: A filename for an answer text in the df, ex. 'g0pB_taskd.txt'\n", + " :return: A single containment value that represents the similarity\n", + " between an answer text and its source text.\n", + " '''\n", + " \n", + " # your code here\n", + " row = df.loc[df['File'] == answer_filename]\n", + " answer = row['File']\n", + " answer_location = answer.index.item()\n", + " \n", + " source = df[(df['Task'] == df.iloc[answer_location]['Task']) & (df['Category'] == -1)]\n", + " source_location = source.index.item()\n", + " \n", + " counts = CountVectorizer(analyzer='word', ngram_range=(n,n))\n", + " ngrams = counts.fit_transform(df['Text'])\n", + " \n", + " ngram_array = ngrams.toarray()\n", + " answer_and_source = ngram_array[(answer_location, source_location),]\n", + " \n", + " sum_intersection_ngrams = np.sum(np.min(answer_and_source, axis=0))\n", + " containment = sum_intersection_ngrams / np.sum(answer_and_source[0])\n", + " \n", + " return containment\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Test cells\n", + "\n", + "After you've implemented the containment function, you can test out its behavior. \n", + "\n", + "The cell below iterates through the first few files, and calculates the original category _and_ containment values for a specified n and file.\n", + "\n", + ">If you've implemented this correctly, you should see that the non-plagiarized have low or close to 0 containment values and that plagiarized examples have higher containment values, closer to 1.\n", + "\n", + "Note what happens when you change the value of n. I recommend applying your code to multiple files and comparing the resultant containment values. You should see that the highest containment values correspond to files with the highest category (`cut`) of plagiarism level." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 11, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Original category values: \n", + " [0, 3, 2, 1, 0]\n", + "\n", + "3-gram containment values: \n", + " [0.009345794392523364, 0.9641025641025641, 0.6136363636363636, 0.15675675675675677, 0.031746031746031744]\n" + ] + } + ], + "source": [ + "# select a value for n\n", + "n = 3\n", + "\n", + "# indices for first few files\n", + "test_indices = range(5)\n", + "\n", + "# iterate through files and calculate containment\n", + "category_vals = []\n", + "containment_vals = []\n", + "for i in test_indices:\n", + " # get level of plagiarism for a given file index\n", + " category_vals.append(complete_df.loc[i, 'Category'])\n", + " # calculate containment for given file and n\n", + " filename = complete_df.loc[i, 'File']\n", + " c = calculate_containment(complete_df, n, filename)\n", + " containment_vals.append(c)\n", + "\n", + "# print out result, does it make sense?\n", + "print('Original category values: \\n', category_vals)\n", + "print()\n", + "print(str(n)+'-gram containment values: \\n', containment_vals)" + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Tests Passed!\n" + ] + } + ], + "source": [ + "# run this test cell\n", + "\"\"\"\n", + "DON'T MODIFY ANYTHING IN THIS CELL THAT IS BELOW THIS LINE\n", + "\"\"\"\n", + "# test containment calculation\n", + "# params: complete_df from before, and containment function\n", + "tests.test_containment(complete_df, calculate_containment)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### QUESTION 1: Why can we calculate containment features across *all* data (training & test), prior to splitting the DataFrame for modeling? That is, what about the containment calculation means that the test and training data do not influence each other?" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "**Answer:** To make the model as accurate as possible, we train it on both training and test datasets. This also eliminates the condition of missing out important words that might signify plagiarism.\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "---\n", + "## Longest Common Subsequence\n", + "\n", + "Containment a good way to find overlap in word usage between two documents; it may help identify cases of cut-and-paste as well as paraphrased levels of plagiarism. Since plagiarism is a fairly complex task with varying levels, it's often useful to include other measures of similarity. The paper also discusses a feature called **longest common subsequence**.\n", + "\n", + "> The longest common subsequence is the longest string of words (or letters) that are *the same* between the Wikipedia Source Text (S) and the Student Answer Text (A). This value is also normalized by dividing by the total number of words (or letters) in the Student Answer Text. \n", + "\n", + "In this exercise, we'll ask you to calculate the longest common subsequence of words between two texts.\n", + "\n", + "### EXERCISE: Calculate the longest common subsequence\n", + "\n", + "Complete the function `lcs_norm_word`; this should calculate the *longest common subsequence* of words between a Student Answer Text and corresponding Wikipedia Source Text. \n", + "\n", + "It may be helpful to think of this in a concrete example. 
A Longest Common Subsequence (LCS) problem may look as follows:\n", + "* Given two texts: text A (answer text) of length n, and string S (original source text) of length m. Our goal is to produce their longest common subsequence of words: the longest sequence of words that appear left-to-right in both texts (though the words don't have to be in continuous order).\n", + "* Consider:\n", + " * A = \"i think pagerank is a link analysis algorithm used by google that uses a system of weights attached to each element of a hyperlinked set of documents\"\n", + " * S = \"pagerank is a link analysis algorithm used by the google internet search engine that assigns a numerical weighting to each element of a hyperlinked set of documents\"\n", + "\n", + "* In this case, we can see that the start of each sentence of fairly similar, having overlap in the sequence of words, \"pagerank is a link analysis algorithm used by\" before diverging slightly. Then we **continue moving left -to-right along both texts** until we see the next common sequence; in this case it is only one word, \"google\". Next we find \"that\" and \"a\" and finally the same ending \"to each element of a hyperlinked set of documents\".\n", + "* Below, is a clear visual of how these sequences were found, sequentially, in each text.\n", + "\n", + "\n", + "\n", + "* Now, those words appear in left-to-right order in each document, sequentially, and even though there are some words in between, we count this as the longest common subsequence between the two texts. \n", + "* If I count up each word that I found in common I get the value 20. **So, LCS has length 20**. \n", + "* Next, to normalize this value, divide by the total length of the student answer; in this example that length is only 27. **So, the function `lcs_norm_word` should return the value `20/27` or about `0.7408`.**\n", + "\n", + "In this way, LCS is a great indicator of cut-and-paste plagiarism or if someone has referenced the same source text multiple times in an answer." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### LCS, dynamic programming\n", + "\n", + "If you read through the scenario above, you can see that this algorithm depends on looking at two texts and comparing them word by word. You can solve this problem in multiple ways. First, it may be useful to `.split()` each text into lists of comma separated words to compare. Then, you can iterate through each word in the texts and compare them, adding to your value for LCS as you go. \n", + "\n", + "The method I recommend for implementing an efficient LCS algorithm is: using a matrix and dynamic programming. **Dynamic programming** is all about breaking a larger problem into a smaller set of subproblems, and building up a complete result without having to repeat any subproblems. \n", + "\n", + "This approach assumes that you can split up a large LCS task into a combination of smaller LCS tasks. Let's look at a simple example that compares letters:\n", + "\n", + "* A = \"ABCD\"\n", + "* S = \"BD\"\n", + "\n", + "We can see right away that the longest subsequence of _letters_ here is 2 (B and D are in sequence in both strings). 
And we can calculate this by looking at relationships between each letter in the two strings, A and S.\n", + "\n", + "Here, I have a matrix with the letters of A on top and the letters of S on the left side:\n", + "\n", + "\n", + "\n", + "This starts out as a matrix that has as many columns and rows as letters in the strings S and O **+1** additional row and column, filled with zeros on the top and left sides. So, in this case, instead of a 2x4 matrix it is a 3x5.\n", + "\n", + "Now, we can fill this matrix up by breaking it into smaller LCS problems. For example, let's first look at the shortest substrings: the starting letter of A and S. We'll first ask, what is the Longest Common Subsequence between these two letters \"A\" and \"B\"? \n", + "\n", + "**Here, the answer is zero and we fill in the corresponding grid cell with that value.**\n", + "\n", + "\n", + "\n", + "Then, we ask the next question, what is the LCS between \"AB\" and \"B\"?\n", + "\n", + "**Here, we have a match, and can fill in the appropriate value 1**.\n", + "\n", + "\n", + "\n", + "If we continue, we get to a final matrix that looks as follows, with a **2** in the bottom right corner.\n", + "\n", + "\n", + "\n", + "The final LCS will be that value **2** *normalized* by the number of n-grams in A. So, our normalized value is 2/4 = **0.5**.\n", + "\n", + "### The matrix rules\n", + "\n", + "One thing to notice here is that, you can efficiently fill up this matrix one cell at a time. Each grid cell only depends on the values in the grid cells that are directly on top and to the left of it, or on the diagonal/top-left. The rules are as follows:\n", + "* Start with a matrix that has one extra row and column of zeros.\n", + "* As you traverse your string:\n", + " * If there is a match, fill that grid cell with the value to the top-left of that cell *plus* one. So, in our case, when we found a matching B-B, we added +1 to the value in the top-left of the matching cell, 0.\n", + " * If there is not a match, take the *maximum* value from either directly to the left or the top cell, and carry that value over to the non-match cell.\n", + "\n", + "\n", + "\n", + "After completely filling the matrix, **the bottom-right cell will hold the non-normalized LCS value**.\n", + "\n", + "This matrix treatment can be applied to a set of words instead of letters. Your function should apply this to the words in two texts and return the normalized LCS value." 
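+    ,"\n",
+    "\n",
+    "As a quick, optional illustration (a sketch only, not the `lcs_norm_word` implementation you are asked to write below), the matrix rules above can be coded directly for the letter example A = \"ABCD\" and S = \"BD\"; the variable names here are just for this sketch:\n",
+    "\n",
+    "```python\n",
+    "import numpy as np\n",
+    "\n",
+    "A = \"ABCD\"   # letters of A label the columns\n",
+    "S = \"BD\"     # letters of S label the rows\n",
+    "\n",
+    "lcs_matrix = np.zeros((len(S) + 1, len(A) + 1), dtype=int)\n",
+    "for row, s_char in enumerate(S, 1):\n",
+    "    for col, a_char in enumerate(A, 1):\n",
+    "        if s_char == a_char:\n",
+    "            # match: the diagonal (top-left) value plus one\n",
+    "            lcs_matrix[row][col] = lcs_matrix[row - 1][col - 1] + 1\n",
+    "        else:\n",
+    "            # no match: carry over the larger of the top and left cells\n",
+    "            lcs_matrix[row][col] = max(lcs_matrix[row - 1][col], lcs_matrix[row][col - 1])\n",
+    "\n",
+    "print(lcs_matrix[-1][-1] / len(A))  # 2 / 4 = 0.5\n",
+    "```"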
+ ] + }, + { + "cell_type": "code", + "execution_count": 13, + "metadata": {}, + "outputs": [], + "source": [ + "# Compute the normalized LCS given an answer text and a source text\n", + "def lcs_norm_word(answer_text, source_text):\n", + " '''Computes the longest common subsequence of words in two texts; returns a normalized value.\n", + " :param answer_text: The pre-processed text for an answer text\n", + " :param source_text: The pre-processed text for an answer's associated source text\n", + " :return: A normalized LCS value'''\n", + " \n", + " answer = answer_text.split()\n", + " source = source_text.split()\n", + " \n", + " lcs_matrix = np.zeros((len(answer) + 1, len(source) + 1))\n", + " row_index= 0\n", + " col_index = 0\n", + " for row_index in range(0, len(answer)):\n", + " answer_word = answer[row_index]\n", + " for col_index in range(0, len(source)):\n", + " source_word = source[col_index]\n", + " if source_word == answer_word:\n", + " lcs_matrix[row_index + 1][col_index + 1] = (lcs_matrix[row_index][col_index]) + 1\n", + " else: \n", + " lcs_matrix[row_index + 1][col_index + 1] = max(lcs_matrix[row_index][col_index + 1], \n", + " lcs_matrix[row_index + 1][col_index])\n", + "\n", + " normalized_lcs = lcs_matrix[len(answer)][len(source)] / len(answer)\n", + " print(normalized_lcs)\n", + " return normalized_lcs\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Test cells\n", + "\n", + "Let's start by testing out your code on the example given in the initial description.\n", + "\n", + "In the below cell, we have specified strings A (answer text) and S (original source text). We know that these texts have 20 words in common and the submitted answer is 27 words long, so the normalized, longest common subsequence should be 20/27.\n" + ] + }, + { + "cell_type": "code", + "execution_count": 14, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "0.7407407407407407\n", + "LCS = 0.7407407407407407\n", + "Test passed!\n" + ] + } + ], + "source": [ + "# Run the test scenario from above\n", + "# does your function return the expected value?\n", + "\n", + "A = \"i think pagerank is a link analysis algorithm used by google that uses a system of weights attached to each element of a hyperlinked set of documents\"\n", + "S = \"pagerank is a link analysis algorithm used by the google internet search engine that assigns a numerical weighting to each element of a hyperlinked set of documents\"\n", + "\n", + "# calculate LCS\n", + "lcs = lcs_norm_word(A, S)\n", + "print('LCS = ', lcs)\n", + "\n", + "\n", + "# expected value test\n", + "assert lcs==20/27., \"Incorrect LCS value, expected about 0.7408, got \"+str(lcs)\n", + "\n", + "print('Test passed!')" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "This next cell runs a more rigorous test." + ] + }, + { + "cell_type": "code", + "execution_count": 15, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
FileTaskCategoryClassTextDatatype
0g0pA_taska.txta00inheritance is a basic concept of object orien...train
1g0pA_taskb.txtb31pagerank is a link analysis algorithm used by ...test
2g0pA_taskc.txtc21the vector space model also called term vector...train
3g0pA_taskd.txtd11bayes theorem was names after rev thomas bayes...train
4g0pA_taske.txte00dynamic programming is an algorithm design tec...train
\n", + "
" + ], + "text/plain": [ + " File Task Category Class \\\n", + "0 g0pA_taska.txt a 0 0 \n", + "1 g0pA_taskb.txt b 3 1 \n", + "2 g0pA_taskc.txt c 2 1 \n", + "3 g0pA_taskd.txt d 1 1 \n", + "4 g0pA_taske.txt e 0 0 \n", + "\n", + " Text Datatype \n", + "0 inheritance is a basic concept of object orien... train \n", + "1 pagerank is a link analysis algorithm used by ... test \n", + "2 the vector space model also called term vector... train \n", + "3 bayes theorem was names after rev thomas bayes... train \n", + "4 dynamic programming is an algorithm design tec... train " + ] + }, + "execution_count": 15, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "complete_df.head()" + ] + }, + { + "cell_type": "code", + "execution_count": 16, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "0.42783505154639173\n", + "0.1917808219178082\n", + "0.8207547169811321\n", + "0.8464912280701754\n", + "0.3160621761658031\n", + "0.24257425742574257\n", + "Tests Passed!\n" + ] + } + ], + "source": [ + "# run test cell\n", + "\"\"\"\n", + "DON'T MODIFY ANYTHING IN THIS CELL THAT IS BELOW THIS LINE\n", + "\"\"\"\n", + "# test lcs implementation\n", + "# params: complete_df from before, and lcs_norm_word function\n", + "tests.test_lcs(complete_df, lcs_norm_word)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Finally, take a look at a few resultant values for `lcs_norm_word`. Just like before, you should see that higher values correspond to higher levels of plagiarism." + ] + }, + { + "cell_type": "code", + "execution_count": 17, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "0.1917808219178082\n", + "0.8207547169811321\n", + "0.8464912280701754\n", + "0.3160621761658031\n", + "0.24257425742574257\n", + "Original category values: \n", + " [0, 3, 2, 1, 0]\n", + "\n", + "Normalized LCS values: \n", + " [0.1917808219178082, 0.8207547169811321, 0.8464912280701754, 0.3160621761658031, 0.24257425742574257]\n" + ] + } + ], + "source": [ + "# test on your own\n", + "test_indices = range(5) # look at first few files\n", + "\n", + "category_vals = []\n", + "lcs_norm_vals = []\n", + "# iterate through first few docs and calculate LCS\n", + "for i in test_indices:\n", + " category_vals.append(complete_df.loc[i, 'Category'])\n", + " # get texts to compare\n", + " answer_text = complete_df.loc[i, 'Text'] \n", + " task = complete_df.loc[i, 'Task']\n", + " # we know that source texts have Class = -1\n", + " orig_rows = complete_df[(complete_df['Class'] == -1)]\n", + " orig_row = orig_rows[(orig_rows['Task'] == task)]\n", + " source_text = orig_row['Text'].values[0]\n", + " # calculate lcs\n", + " lcs_val = lcs_norm_word(answer_text, source_text)\n", + " lcs_norm_vals.append(lcs_val)\n", + "\n", + "# print out result, does it make sense?\n", + "print('Original category values: \\n', category_vals)\n", + "print()\n", + "print('Normalized LCS values: \\n', lcs_norm_vals)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "---\n", + "# Create All Features\n", + "\n", + "Now that you've completed the feature calculation functions, it's time to actually create multiple features and decide on which ones to use in your final model! 
In the below cells, you're provided two helper functions to help you create multiple features and store those in a DataFrame, `features_df`.\n",
+    "\n",
+    "### Creating multiple containment features\n",
+    "\n",
+    "Your completed `calculate_containment` function will be called in the next cell, which defines the helper function `create_containment_features`. \n",
+    "\n",
+    "> This function returns a list of containment features, calculated for a given `n` and for *all* files in a df (assumed to be the `complete_df`).\n",
+    "\n",
+    "For our original files, the containment value is set to a special value, -1.\n",
+    "\n",
+    "This function gives you the ability to easily create several containment features, of different n-gram lengths, for each of our text files."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 18,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "\"\"\"\n",
+    "DON'T MODIFY ANYTHING IN THIS CELL THAT IS BELOW THIS LINE\n",
+    "\"\"\"\n",
+    "# Function returns a list of containment features, calculated for a given n \n",
+    "# Should return a list of length 100 for all files in a complete_df\n",
+    "def create_containment_features(df, n, column_name=None):\n",
+    "    \n",
+    "    containment_values = []\n",
+    "    \n",
+    "    if(column_name==None):\n",
+    "        column_name = 'c_'+str(n) # c_1, c_2, .. c_n\n",
+    "    \n",
+    "    # iterates through dataframe rows\n",
+    "    for i in df.index:\n",
+    "        file = df.loc[i, 'File']\n",
+    "        # Computes features using calculate_containment function\n",
+    "        if df.loc[i,'Category'] > -1:\n",
+    "            c = calculate_containment(df, n, file)\n",
+    "            containment_values.append(c)\n",
+    "        # Sets value to -1 for original tasks \n",
+    "        else:\n",
+    "            containment_values.append(-1)\n",
+    "    \n",
+    "    print(str(n)+'-gram containment features created!')\n",
+    "    return containment_values\n"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "### Creating LCS features\n",
+    "\n",
+    "Below, your completed `lcs_norm_word` function is used to create a list of LCS features for all the answer files in a given DataFrame (again, this assumes you are passing in the `complete_df`). 
It assigns a special value for our original, source files, -1.\n" + ] + }, + { + "cell_type": "code", + "execution_count": 19, + "metadata": {}, + "outputs": [], + "source": [ + "\"\"\"\n", + "DON'T MODIFY ANYTHING IN THIS CELL THAT IS BELOW THIS LINE\n", + "\"\"\"\n", + "# Function creates lcs feature and add it to the dataframe\n", + "def create_lcs_features(df, column_name='lcs_word'):\n", + " \n", + " lcs_values = []\n", + " \n", + " # iterate through files in dataframe\n", + " for i in df.index:\n", + " # Computes LCS_norm words feature using function above for answer tasks\n", + " if df.loc[i,'Category'] > -1:\n", + " # get texts to compare\n", + " answer_text = df.loc[i, 'Text'] \n", + " task = df.loc[i, 'Task']\n", + " # we know that source texts have Class = -1\n", + " orig_rows = df[(df['Class'] == -1)]\n", + " orig_row = orig_rows[(orig_rows['Task'] == task)]\n", + " source_text = orig_row['Text'].values[0]\n", + "\n", + " # calculate lcs\n", + " lcs = lcs_norm_word(answer_text, source_text)\n", + " lcs_values.append(lcs)\n", + " # Sets to -1 for original tasks \n", + " else:\n", + " lcs_values.append(-1)\n", + "\n", + " print('LCS features created!')\n", + " return lcs_values\n", + " " + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## EXERCISE: Create a features DataFrame by selecting an `ngram_range`\n", + "\n", + "The paper suggests calculating the following features: containment *1-gram to 5-gram* and *longest common subsequence*. \n", + "> In this exercise, you can choose to create even more features, for example from *1-gram to 7-gram* containment features and *longest common subsequence*. \n", + "\n", + "You'll want to create at least 6 features to choose from as you think about which to give to your final, classification model. Defining and comparing at least 6 different features allows you to discard any features that seem redundant, and choose to use the best features for your final model!\n", + "\n", + "In the below cell **define an n-gram range**; these will be the n's you use to create n-gram containment features. The rest of the feature creation code is provided." 
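+    ,"\n",
+    "\n",
+    "Just as an illustration of the format (the next cell defines its own value), the paper's suggested 1-gram to 5-gram containment features would correspond to:\n",
+    "\n",
+    "```python\n",
+    "# example only: yields containment columns c_1 through c_5 (lcs_word is added separately)\n",
+    "ngram_range = range(1, 6)\n",
+    "```"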
+ ] + }, + { + "cell_type": "code", + "execution_count": 20, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "1-gram containment features created!\n", + "2-gram containment features created!\n", + "3-gram containment features created!\n", + "4-gram containment features created!\n", + "5-gram containment features created!\n", + "6-gram containment features created!\n", + "0.1917808219178082\n", + "0.8207547169811321\n", + "0.8464912280701754\n", + "0.3160621761658031\n", + "0.24257425742574257\n", + "0.16117216117216118\n", + "0.30165289256198347\n", + "0.6217105263157895\n", + "0.484304932735426\n", + "0.597457627118644\n", + "0.42783505154639173\n", + "0.2708333333333333\n", + "0.22395833333333334\n", + "0.9\n", + "0.8940397350993378\n", + "0.8232044198895028\n", + "0.775\n", + "0.45977011494252873\n", + "0.3055555555555556\n", + "0.2826086956521739\n", + "0.9930555555555556\n", + "0.7888888888888889\n", + "0.3246753246753247\n", + "0.3466666666666667\n", + "1.0\n", + "0.18932038834951456\n", + "0.36893203883495146\n", + "0.4166666666666667\n", + "0.4898785425101215\n", + "0.24742268041237114\n", + "0.21875\n", + "0.29441624365482233\n", + "0.5163934426229508\n", + "0.4725274725274725\n", + "0.6064516129032258\n", + "0.536697247706422\n", + "0.39436619718309857\n", + "0.25833333333333336\n", + "0.2789115646258503\n", + "0.3431372549019608\n", + "0.15302491103202848\n", + "0.4559386973180077\n", + "0.82\n", + "0.45\n", + "0.22935779816513763\n", + "0.16535433070866143\n", + "0.26046511627906976\n", + "0.3415841584158416\n", + "0.9294117647058824\n", + "1.0\n", + "0.6699029126213593\n", + "0.3551912568306011\n", + "0.23376623376623376\n", + "0.3492647058823529\n", + "0.3476190476190476\n", + "0.5677233429394812\n", + "0.774390243902439\n", + "0.19298245614035087\n", + "0.21818181818181817\n", + "0.26666666666666666\n", + "0.22110552763819097\n", + "0.5047169811320755\n", + "0.5585585585585585\n", + "0.9966996699669967\n", + "0.2289156626506024\n", + "0.1722488038277512\n", + "0.23684210526315788\n", + "0.29493087557603687\n", + "0.5037593984962406\n", + "0.9117647058823529\n", + "0.9923076923076923\n", + "0.2833333333333333\n", + "0.2616822429906542\n", + "0.6470588235294118\n", + "0.85\n", + "0.178743961352657\n", + "0.2350230414746544\n", + "0.6619718309859155\n", + "0.7911111111111111\n", + "0.9298245614035088\n", + "0.8546712802768166\n", + "0.2983425414364641\n", + "0.2230769230769231\n", + "0.9270833333333334\n", + "0.9098039215686274\n", + "0.4900990099009901\n", + "0.25203252032520324\n", + "0.1774193548387097\n", + "0.22767857142857142\n", + "0.6437246963562753\n", + "0.24271844660194175\n", + "0.8395061728395061\n", + "0.2830188679245283\n", + "0.16176470588235295\n", + "0.24583333333333332\n", + "LCS features created!\n", + "\n", + "Features: ['c_1', 'c_2', 'c_3', 'c_4', 'c_5', 'c_6', 'lcs_word']\n", + "\n" + ] + } + ], + "source": [ + "# Define an ngram range\n", + "ngram_range = range(1,7)\n", + "\n", + "\n", + "# The following code may take a minute to run, depending on your ngram_range\n", + "\"\"\"\n", + "DON'T MODIFY ANYTHING IN THIS CELL THAT IS BELOW THIS LINE\n", + "\"\"\"\n", + "features_list = []\n", + "\n", + "# Create features in a features_df\n", + "all_features = np.zeros((len(ngram_range)+1, len(complete_df)))\n", + "\n", + "# Calculate features for containment for ngrams in range\n", + "i=0\n", + "for n in ngram_range:\n", + " column_name = 'c_'+str(n)\n", + " features_list.append(column_name)\n", + " # create 
containment features\n", + " all_features[i]=np.squeeze(create_containment_features(complete_df, n))\n", + " i+=1\n", + "\n", + "# Calculate features for LCS_Norm Words \n", + "features_list.append('lcs_word')\n", + "all_features[i]= np.squeeze(create_lcs_features(complete_df))\n", + "\n", + "# create a features dataframe\n", + "features_df = pd.DataFrame(np.transpose(all_features), columns=features_list)\n", + "\n", + "# Print all features/columns\n", + "print()\n", + "print('Features: ', features_list)\n", + "print()" + ] + }, + { + "cell_type": "code", + "execution_count": 21, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
c_1c_2c_3c_4c_5c_6lcs_word
00.3981480.0790700.0093460.0000000.0000000.0000000.191781
11.0000000.9846940.9641030.9432990.9222800.9010420.820755
20.8693690.7194570.6136360.5159820.4495410.3824880.846491
30.5935830.2688170.1567570.1086960.0819670.0604400.316062
40.5445030.1157890.0317460.0053190.0000000.0000000.242574
50.3295020.0538460.0077220.0038760.0000000.0000000.161172
60.5903080.1504420.0355560.0044640.0000000.0000000.301653
70.7653060.7098980.6643840.6254300.5896550.5536330.621711
80.7597770.5056180.3954800.3068180.2457140.1954020.484305
90.8844440.5267860.3408070.2477480.1809950.1500000.597458
\n", + "
" + ], + "text/plain": [ + " c_1 c_2 c_3 c_4 c_5 c_6 lcs_word\n", + "0 0.398148 0.079070 0.009346 0.000000 0.000000 0.000000 0.191781\n", + "1 1.000000 0.984694 0.964103 0.943299 0.922280 0.901042 0.820755\n", + "2 0.869369 0.719457 0.613636 0.515982 0.449541 0.382488 0.846491\n", + "3 0.593583 0.268817 0.156757 0.108696 0.081967 0.060440 0.316062\n", + "4 0.544503 0.115789 0.031746 0.005319 0.000000 0.000000 0.242574\n", + "5 0.329502 0.053846 0.007722 0.003876 0.000000 0.000000 0.161172\n", + "6 0.590308 0.150442 0.035556 0.004464 0.000000 0.000000 0.301653\n", + "7 0.765306 0.709898 0.664384 0.625430 0.589655 0.553633 0.621711\n", + "8 0.759777 0.505618 0.395480 0.306818 0.245714 0.195402 0.484305\n", + "9 0.884444 0.526786 0.340807 0.247748 0.180995 0.150000 0.597458" + ] + }, + "execution_count": 21, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# print some results \n", + "features_df.head(10)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Correlated Features\n", + "\n", + "You should use feature correlation across the *entire* dataset to determine which features are ***too*** **highly-correlated** with each other to include both features in a single model. For this analysis, you can use the *entire* dataset due to the small sample size we have. \n", + "\n", + "All of our features try to measure the similarity between two texts. Since our features are designed to measure similarity, it is expected that these features will be highly-correlated. Many classification models, for example a Naive Bayes classifier, rely on the assumption that features are *not* highly correlated; highly-correlated features may over-inflate the importance of a single feature. \n", + "\n", + "So, you'll want to choose your features based on which pairings have the lowest correlation. These correlation values range between 0 and 1; from low to high correlation, and are displayed in a [correlation matrix](https://www.displayr.com/what-is-a-correlation-matrix/), below." + ] + }, + { + "cell_type": "code", + "execution_count": 22, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
c_1c_2c_3c_4c_5c_6lcs_word
c_11.000.940.900.890.880.870.97
c_20.941.000.990.980.970.960.98
c_30.900.991.001.000.990.980.97
c_40.890.981.001.001.000.990.95
c_50.880.970.991.001.001.000.95
c_60.870.960.980.991.001.000.94
lcs_word0.970.980.970.950.950.941.00
\n", + "
" + ], + "text/plain": [ + " c_1 c_2 c_3 c_4 c_5 c_6 lcs_word\n", + "c_1 1.00 0.94 0.90 0.89 0.88 0.87 0.97\n", + "c_2 0.94 1.00 0.99 0.98 0.97 0.96 0.98\n", + "c_3 0.90 0.99 1.00 1.00 0.99 0.98 0.97\n", + "c_4 0.89 0.98 1.00 1.00 1.00 0.99 0.95\n", + "c_5 0.88 0.97 0.99 1.00 1.00 1.00 0.95\n", + "c_6 0.87 0.96 0.98 0.99 1.00 1.00 0.94\n", + "lcs_word 0.97 0.98 0.97 0.95 0.95 0.94 1.00" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "\"\"\"\n", + "DON'T MODIFY ANYTHING IN THIS CELL THAT IS BELOW THIS LINE\n", + "\"\"\"\n", + "# Create correlation matrix for just Features to determine different models to test\n", + "corr_matrix = features_df.corr().abs().round(2)\n", + "\n", + "# display shows all of a dataframe\n", + "display(corr_matrix)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## EXERCISE: Create selected train/test data\n", + "\n", + "Complete the `train_test_data` function below. This function should take in the following parameters:\n", + "* `complete_df`: A DataFrame that contains all of our processed text data, file info, datatypes, and class labels\n", + "* `features_df`: A DataFrame of all calculated features, such as containment for ngrams, n= 1-5, and lcs values for each text file listed in the `complete_df` (this was created in the above cells)\n", + "* `selected_features`: A list of feature column names, ex. `['c_1', 'lcs_word']`, which will be used to select the final features in creating train/test sets of data.\n", + "\n", + "It should return two tuples:\n", + "* `(train_x, train_y)`, selected training features and their corresponding class labels (0/1)\n", + "* `(test_x, test_y)`, selected training features and their corresponding class labels (0/1)\n", + "\n", + "** Note: x and y should be arrays of feature values and numerical class labels, respectively; not DataFrames.**\n", + "\n", + "Looking at the above correlation matrix, you should decide on a **cutoff** correlation value, less than 1.0, to determine which sets of features are *too* highly-correlated to be included in the final training and test data. If you cannot find features that are less correlated than some cutoff value, it is suggested that you increase the number of features (longer n-grams) to choose from or use *only one or two* features in your final model to avoid introducing highly-correlated features.\n", + "\n", + "Recall that the `complete_df` has a `Datatype` column that indicates whether data should be `train` or `test` data; this should help you split the data appropriately." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 23, + "metadata": {}, + "outputs": [], + "source": [ + "# Takes in dataframes and a list of selected features (column names) \n", + "# and returns (train_x, train_y), (test_x, test_y)\n", + "def train_test_data(complete_df, features_df, selected_features):\n", + " '''Gets selected training and test features from given dataframes, and \n", + " returns tuples for training and test features and their corresponding class labels.\n", + " :param complete_df: A dataframe with all of our processed text data, datatypes, and labels\n", + " :param features_df: A dataframe of all computed, similarity features\n", + " :param selected_features: An array of selected features that correspond to certain columns in `features_df`\n", + " :return: training and test features and labels: (train_x, train_y), (test_x, test_y)'''\n", + " \n", + " merged_df = complete_df.merge(features_df, left_index=True, right_index=True)\n", + " \n", + " # get the training features\n", + " train_x = merged_df.loc[merged_df.Datatype == 'train', selected_features].values\n", + " # And training class labels (0 or 1)\n", + " train_y = merged_df.loc[merged_df.Datatype == 'train', 'Class'].values\n", + " \n", + " # get the test features and labels\n", + " test_x = merged_df.loc[merged_df.Datatype == 'test', selected_features].values\n", + " test_y = merged_df.loc[merged_df.Datatype == 'test', 'Class'].values\n", + " \n", + " return (train_x, train_y), (test_x, test_y)\n", + " " + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Test cells\n", + "\n", + "Below, test out your implementation and create the final train/test data." + ] + }, + { + "cell_type": "code", + "execution_count": 24, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Tests Passed!\n" + ] + } + ], + "source": [ + "\"\"\"\n", + "DON'T MODIFY ANYTHING IN THIS CELL THAT IS BELOW THIS LINE\n", + "\"\"\"\n", + "test_selection = list(features_df)[:2] # first couple columns as a test\n", + "# test that the correct train/test data is created\n", + "(train_x, train_y), (test_x, test_y) = train_test_data(complete_df, features_df, test_selection)\n", + "\n", + "# params: generated train/test data\n", + "tests.test_data_split(train_x, train_y, test_x, test_y)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## EXERCISE: Select \"good\" features\n", + "\n", + "If you passed the test above, you can create your own train/test data, below. \n", + "\n", + "Define a list of features you'd like to include in your final mode, `selected_features`; this is a list of the features names you want to include." + ] + }, + { + "cell_type": "code", + "execution_count": 25, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Training size: 70\n", + "Test size: 25\n", + "\n", + "Training df sample: \n", + " [[0.39814815 0. 0.19178082]\n", + " [0.86936937 0.44954128 0.84649123]\n", + " [0.59358289 0.08196721 0.31606218]\n", + " [0.54450262 0. 0.24257426]\n", + " [0.32950192 0. 0.16117216]\n", + " [0.59030837 0. 0.30165289]\n", + " [0.75977654 0.24571429 0.48430493]\n", + " [0.51612903 0. 0.27083333]\n", + " [0.44086022 0. 0.22395833]\n", + " [0.97945205 0.78873239 0.9 ]]\n" + ] + } + ], + "source": [ + "# Select your list of features, this should be column names from features_df\n", + "# ex. 
['c_1', 'lcs_word']\n", + "selected_features = ['c_1', 'c_5', 'lcs_word']\n", + "\n", + "\n", + "\"\"\"\n", + "DON'T MODIFY ANYTHING IN THIS CELL THAT IS BELOW THIS LINE\n", + "\"\"\"\n", + "\n", + "(train_x, train_y), (test_x, test_y) = train_test_data(complete_df, features_df, selected_features)\n", + "\n", + "# check that division of samples seems correct\n", + "# these should add up to 95 (100 - 5 original files)\n", + "print('Training size: ', len(train_x))\n", + "print('Test size: ', len(test_x))\n", + "print()\n", + "print('Training df sample: \\n', train_x[:10])" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Question 2: How did you decide on which features to include in your final model? " + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "**Answer:** We decide features based on uniqueness and their correlation to other features. \n", + "For example, in this model, c_1 and lcs_word represent single words common and longest common subsequence respectively, which are totally unique and hence must be selected.\n", + "Now we see c_2, c_3, c_4, c_5 are highly correlated to one another. So selecting one of them should do the work. We finally settle on selecting c_5 as it is least correlated to c_1 and lcs_word, and hence more unique than other features.\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "---\n", + "## Creating Final Data Files\n", + "\n", + "Now, you are almost ready to move on to training a model in SageMaker!\n", + "\n", + "You'll want to access your train and test data in SageMaker and upload it to S3. In this project, SageMaker will expect the following format for your train/test data:\n", + "* Training and test data should be saved in one `.csv` file each, ex `train.csv` and `test.csv`\n", + "* These files should have class labels in the first column and features in the rest of the columns\n", + "\n", + "This format follows the practice, outlined in the [SageMaker documentation](https://docs.aws.amazon.com/sagemaker/latest/dg/cdf-training.html), which reads: \"Amazon SageMaker requires that a CSV file doesn't have a header record and that the target variable [class label] is in the first column.\"\n", + "\n", + "## EXERCISE: Create csv files\n", + "\n", + "Define a function that takes in x (features) and y (labels) and saves them to one `.csv` file at the path `data_dir/filename`.\n", + "\n", + "It may be useful to use pandas to merge your features and labels into one DataFrame and then convert that into a csv file. You can make sure to get rid of any incomplete rows, in a DataFrame, by using `dropna`." + ] + }, + { + "cell_type": "code", + "execution_count": 26, + "metadata": {}, + "outputs": [], + "source": [ + "def make_csv(x, y, filename, data_dir):\n", + " '''Merges features and labels and converts them into one csv file with labels in the first column.\n", + " :param x: Data features\n", + " :param y: Data labels\n", + " :param file_name: Name of csv file, ex. 
'train.csv'\n", + " :param data_dir: The directory where files will be saved\n", + " '''\n", + " # make data dir, if it does not exist\n", + " if not os.path.exists(data_dir):\n", + " os.makedirs(data_dir)\n", + " \n", + " # your code here\n", + " df = pd.concat([pd.DataFrame(y), pd.DataFrame(x)], axis=1).dropna()\n", + " df.to_csv(os.path.join(data_dir, filename), header=False, index=False)\n", + " \n", + " # nothing is returned, but a print statement indicates that the function has run\n", + " print('Path created: '+str(data_dir)+'/'+str(filename))" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Test cells\n", + "\n", + "Test that your code produces the correct format for a `.csv` file, given some text features and labels." + ] + }, + { + "cell_type": "code", + "execution_count": 27, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Path created: test_csv/to_delete.csv\n", + "Tests passed!\n" + ] + } + ], + "source": [ + "\"\"\"\n", + "DON'T MODIFY ANYTHING IN THIS CELL THAT IS BELOW THIS LINE\n", + "\"\"\"\n", + "fake_x = [ [0.39814815, 0.0001, 0.19178082], \n", + " [0.86936937, 0.44954128, 0.84649123], \n", + " [0.44086022, 0., 0.22395833] ]\n", + "\n", + "fake_y = [0, 1, 1]\n", + "\n", + "make_csv(fake_x, fake_y, filename='to_delete.csv', data_dir='test_csv')\n", + "\n", + "# read in and test dimensions\n", + "fake_df = pd.read_csv('test_csv/to_delete.csv', header=None)\n", + "\n", + "# check shape\n", + "assert fake_df.shape==(3, 4), \\\n", + " 'The file should have as many rows as data_points and as many columns as features+1 (for indices).'\n", + "# check that first column = labels\n", + "assert np.all(fake_df.iloc[:,0].values==fake_y), 'First column is not equal to the labels, fake_y.'\n", + "print('Tests passed!')" + ] + }, + { + "cell_type": "code", + "execution_count": 28, + "metadata": {}, + "outputs": [], + "source": [ + "# delete the test csv file, generated above\n", + "! rm -rf test_csv" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "If you've passed the tests above, run the following cell to create `train.csv` and `test.csv` files in a directory that you specify! This will save the data in a local directory. Remember the name of this directory because you will reference it again when uploading this data to S3." + ] + }, + { + "cell_type": "code", + "execution_count": 29, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Path created: plagiarism_data/train.csv\n", + "Path created: plagiarism_data/test.csv\n" + ] + } + ], + "source": [ + "# can change directory, if you want\n", + "data_dir = 'plagiarism_data'\n", + "\n", + "\"\"\"\n", + "DON'T MODIFY ANYTHING IN THIS CELL THAT IS BELOW THIS LINE\n", + "\"\"\"\n", + "\n", + "make_csv(train_x, train_y, filename='train.csv', data_dir=data_dir)\n", + "make_csv(test_x, test_y, filename='test.csv', data_dir=data_dir)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Up Next\n", + "\n", + "Now that you've done some feature engineering and created some training and test data, you are ready to train and deploy a plagiarism classification model. The next notebook will utilize SageMaker resources to train and test a model that you design." 
+ ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "conda_amazonei_mxnet_p36", + "language": "python", + "name": "conda_amazonei_mxnet_p36" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.6.5" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/.ipynb_checkpoints/3_Training_a_Model-checkpoint.ipynb b/.ipynb_checkpoints/3_Training_a_Model-checkpoint.ipynb new file mode 100644 index 0000000..769f066 --- /dev/null +++ b/.ipynb_checkpoints/3_Training_a_Model-checkpoint.ipynb @@ -0,0 +1,789 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Plagiarism Detection Model\n", + "\n", + "Now that you've created training and test data, you are ready to define and train a model. Your goal in this notebook, will be to train a binary classification model that learns to label an answer file as either plagiarized or not, based on the features you provide the model.\n", + "\n", + "This task will be broken down into a few discrete steps:\n", + "\n", + "* Upload your data to S3.\n", + "* Define a binary classification model and a training script.\n", + "* Train your model and deploy it.\n", + "* Evaluate your deployed classifier and answer some questions about your approach.\n", + "\n", + "To complete this notebook, you'll have to complete all given exercises and answer all the questions in this notebook.\n", + "> All your tasks will be clearly labeled **EXERCISE** and questions as **QUESTION**.\n", + "\n", + "It will be up to you to explore different classification models and decide on a model that gives you the best performance for this dataset.\n", + "\n", + "---" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Load Data to S3\n", + "\n", + "In the last notebook, you should have created two files: a `training.csv` and `test.csv` file with the features and class labels for the given corpus of plagiarized/non-plagiarized text data. \n", + "\n", + ">The below cells load in some AWS SageMaker libraries and creates a default bucket. After creating this bucket, you can upload your locally stored data to S3.\n", + "\n", + "Save your train and test `.csv` feature files, locally. To do this you can run the second notebook \"2_Plagiarism_Feature_Engineering\" in SageMaker or you can manually upload your files to this notebook using the upload icon in Jupyter Lab. Then you can upload local files to S3 by using `sagemaker_session.upload_data` and pointing directly to where the training data is saved." + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [], + "source": [ + "import pandas as pd\n", + "import boto3\n", + "import sagemaker" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [], + "source": [ + "\"\"\"\n", + "DON'T MODIFY ANYTHING IN THIS CELL THAT IS BELOW THIS LINE\n", + "\"\"\"\n", + "# session and role\n", + "sagemaker_session = sagemaker.Session()\n", + "role = sagemaker.get_execution_role()\n", + "\n", + "# create an S3 bucket\n", + "bucket = sagemaker_session.default_bucket()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## EXERCISE: Upload your training data to S3\n", + "\n", + "Specify the `data_dir` where you've saved your `train.csv` file. 
Decide on a descriptive `prefix` that defines where your data will be uploaded in the default S3 bucket. Finally, create a pointer to your training data by calling `sagemaker_session.upload_data` and passing in the required parameters. It may help to look at the [Session documentation](https://sagemaker.readthedocs.io/en/stable/session.html#sagemaker.session.Session.upload_data) or previous SageMaker code examples.\n", + "\n", + "You are expected to upload your entire directory. Later, the training script will only access the `train.csv` file." + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [], + "source": [ + "# should be the name of directory you created to save your features data\n", + "data_dir = 'plagiarism_data'\n", + "\n", + "# set prefix, a descriptive name for a directory \n", + "prefix = 'sagemaker/plagiarism_detector'\n", + "\n", + "# upload all data to S3\n", + "input_data = sagemaker_session.upload_data(path=data_dir, bucket=bucket, key_prefix=prefix)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Test cell\n", + "\n", + "Test that your data has been successfully uploaded. The below cell prints out the items in your S3 bucket and will throw an error if it is empty. You should see the contents of your `data_dir` and perhaps some checkpoints. If you see any other files listed, then you may have some old model files that you can delete via the S3 console (though, additional files shouldn't affect the performance of model developed in this notebook)." + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "sagemaker-scikit-learn-2020-01-07-09-01-41-326/debug-output/training_job_end.ts\n", + "sagemaker-scikit-learn-2020-01-07-09-01-41-326/output/model.tar.gz\n", + "sagemaker-scikit-learn-2020-01-07-09-01-41-326/source/sourcedir.tar.gz\n", + "sagemaker/plagiarism_detector/test.csv\n", + "sagemaker/plagiarism_detector/train.csv\n", + "Test passed!\n" + ] + } + ], + "source": [ + "\"\"\"\n", + "DON'T MODIFY ANYTHING IN THIS CELL THAT IS BELOW THIS LINE\n", + "\"\"\"\n", + "# confirm that data is in S3 bucket\n", + "empty_check = []\n", + "for obj in boto3.resource('s3').Bucket(bucket).objects.all():\n", + " empty_check.append(obj.key)\n", + " print(obj.key)\n", + "\n", + "assert len(empty_check) !=0, 'S3 bucket is empty.'\n", + "print('Test passed!')" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "---\n", + "\n", + "# Modeling\n", + "\n", + "Now that you've uploaded your training data, it's time to define and train a model!\n", + "\n", + "The type of model you create is up to you. For a binary classification task, you can choose to go one of three routes:\n", + "* Use a built-in classification algorithm, like LinearLearner.\n", + "* Define a custom Scikit-learn classifier, a comparison of models can be found [here](https://scikit-learn.org/stable/auto_examples/classification/plot_classifier_comparison.html).\n", + "* Define a custom PyTorch neural network classifier. \n", + "\n", + "It will be up to you to test out a variety of models and choose the best one. Your project will be graded on the accuracy of your final model. \n", + " \n", + "---\n", + "\n", + "## EXERCISE: Complete a training script \n", + "\n", + "To implement a custom classifier, you'll need to complete a `train.py` script. 
You've been given the folders `source_sklearn` and `source_pytorch` which hold starting code for a custom Scikit-learn model and a PyTorch model, respectively. Each directory has a `train.py` training script. To complete this project **you only need to complete one of these scripts**; the script that is responsible for training your final model.\n", + "\n", + "A typical training script:\n", + "* Loads training data from a specified directory\n", + "* Parses any training & model hyperparameters (ex. nodes in a neural network, training epochs, etc.)\n", + "* Instantiates a model of your design, with any specified hyperparams\n", + "* Trains that model \n", + "* Finally, saves the model so that it can be hosted/deployed, later\n", + "\n", + "### Defining and training a model\n", + "Much of the training script code is provided for you. Almost all of your work will be done in the `if __name__ == '__main__':` section. To complete a `train.py` file, you will:\n", + "1. Import any extra libraries you need\n", + "2. Define any additional model training hyperparameters using `parser.add_argument`\n", + "2. Define a model in the `if __name__ == '__main__':` section\n", + "3. Train the model in that same section\n", + "\n", + "Below, you can use `!pygmentize` to display an existing `train.py` file. Read through the code; all of your tasks are marked with `TODO` comments. \n", + "\n", + "**Note: If you choose to create a custom PyTorch model, you will be responsible for defining the model in the `model.py` file,** and a `predict.py` file is provided. If you choose to use Scikit-learn, you only need a `train.py` file; you may import a classifier from the `sklearn` library." + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\u001b[34mfrom\u001b[39;49;00m \u001b[04m\u001b[36m__future__\u001b[39;49;00m \u001b[34mimport\u001b[39;49;00m print_function\r\n", + "\r\n", + "\u001b[34mimport\u001b[39;49;00m \u001b[04m\u001b[36margparse\u001b[39;49;00m\r\n", + "\u001b[34mimport\u001b[39;49;00m \u001b[04m\u001b[36mos\u001b[39;49;00m\r\n", + "\u001b[34mimport\u001b[39;49;00m \u001b[04m\u001b[36mpandas\u001b[39;49;00m \u001b[34mas\u001b[39;49;00m \u001b[04m\u001b[36mpd\u001b[39;49;00m\r\n", + "\r\n", + "\u001b[34mfrom\u001b[39;49;00m \u001b[04m\u001b[36msklearn.externals\u001b[39;49;00m \u001b[34mimport\u001b[39;49;00m joblib\r\n", + "\r\n", + "\u001b[37m## TODO: Import any additional libraries you need to define a model\u001b[39;49;00m\r\n", + "\u001b[34mfrom\u001b[39;49;00m \u001b[04m\u001b[36msklearn.linear_model\u001b[39;49;00m \u001b[34mimport\u001b[39;49;00m LogisticRegression\r\n", + "\r\n", + "\u001b[37m# Provided model load function\u001b[39;49;00m\r\n", + "\u001b[34mdef\u001b[39;49;00m \u001b[32mmodel_fn\u001b[39;49;00m(model_dir):\r\n", + " \u001b[33m\"\"\"Load model from the model_dir. 
This is the same model that is saved\u001b[39;49;00m\r\n", + "\u001b[33m in the main if statement.\u001b[39;49;00m\r\n", + "\u001b[33m \"\"\"\u001b[39;49;00m\r\n", + " \u001b[34mprint\u001b[39;49;00m(\u001b[33m\"\u001b[39;49;00m\u001b[33mLoading model.\u001b[39;49;00m\u001b[33m\"\u001b[39;49;00m)\r\n", + " \r\n", + " \u001b[37m# load using joblib\u001b[39;49;00m\r\n", + " model = joblib.load(os.path.join(model_dir, \u001b[33m\"\u001b[39;49;00m\u001b[33mmodel.joblib\u001b[39;49;00m\u001b[33m\"\u001b[39;49;00m))\r\n", + " \u001b[34mprint\u001b[39;49;00m(\u001b[33m\"\u001b[39;49;00m\u001b[33mDone loading model.\u001b[39;49;00m\u001b[33m\"\u001b[39;49;00m)\r\n", + " \r\n", + " \u001b[34mreturn\u001b[39;49;00m model\r\n", + "\r\n", + "\r\n", + "\u001b[37m## TODO: Complete the main code\u001b[39;49;00m\r\n", + "\u001b[34mif\u001b[39;49;00m \u001b[31m__name__\u001b[39;49;00m == \u001b[33m'\u001b[39;49;00m\u001b[33m__main__\u001b[39;49;00m\u001b[33m'\u001b[39;49;00m:\r\n", + " \r\n", + " \u001b[37m# All of the model parameters and training parameters are sent as arguments\u001b[39;49;00m\r\n", + " \u001b[37m# when this script is executed, during a training job\u001b[39;49;00m\r\n", + " \r\n", + " \u001b[37m# Here we set up an argument parser to easily access the parameters\u001b[39;49;00m\r\n", + " parser = argparse.ArgumentParser()\r\n", + "\r\n", + " \u001b[37m# SageMaker parameters, like the directories for training data and saving models; set automatically\u001b[39;49;00m\r\n", + " \u001b[37m# Do not need to change\u001b[39;49;00m\r\n", + " parser.add_argument(\u001b[33m'\u001b[39;49;00m\u001b[33m--output-data-dir\u001b[39;49;00m\u001b[33m'\u001b[39;49;00m, \u001b[36mtype\u001b[39;49;00m=\u001b[36mstr\u001b[39;49;00m, default=os.environ[\u001b[33m'\u001b[39;49;00m\u001b[33mSM_OUTPUT_DATA_DIR\u001b[39;49;00m\u001b[33m'\u001b[39;49;00m])\r\n", + " parser.add_argument(\u001b[33m'\u001b[39;49;00m\u001b[33m--model-dir\u001b[39;49;00m\u001b[33m'\u001b[39;49;00m, \u001b[36mtype\u001b[39;49;00m=\u001b[36mstr\u001b[39;49;00m, default=os.environ[\u001b[33m'\u001b[39;49;00m\u001b[33mSM_MODEL_DIR\u001b[39;49;00m\u001b[33m'\u001b[39;49;00m])\r\n", + " parser.add_argument(\u001b[33m'\u001b[39;49;00m\u001b[33m--data-dir\u001b[39;49;00m\u001b[33m'\u001b[39;49;00m, \u001b[36mtype\u001b[39;49;00m=\u001b[36mstr\u001b[39;49;00m, default=os.environ[\u001b[33m'\u001b[39;49;00m\u001b[33mSM_CHANNEL_TRAIN\u001b[39;49;00m\u001b[33m'\u001b[39;49;00m])\r\n", + " \r\n", + " \u001b[37m## TODO: Add any additional arguments that you will need to pass into your model\u001b[39;49;00m\r\n", + " \r\n", + " \u001b[37m# args holds all passed-in arguments\u001b[39;49;00m\r\n", + " args = parser.parse_args()\r\n", + "\r\n", + " \u001b[37m# Read in csv training file\u001b[39;49;00m\r\n", + " training_dir = args.data_dir\r\n", + " train_data = pd.read_csv(os.path.join(training_dir, \u001b[33m\"\u001b[39;49;00m\u001b[33mtrain.csv\u001b[39;49;00m\u001b[33m\"\u001b[39;49;00m), header=\u001b[36mNone\u001b[39;49;00m, names=\u001b[36mNone\u001b[39;49;00m)\r\n", + "\r\n", + " \u001b[37m# Labels are in the first column\u001b[39;49;00m\r\n", + " train_y = train_data.iloc[:,\u001b[34m0\u001b[39;49;00m]\r\n", + " train_x = train_data.iloc[:,\u001b[34m1\u001b[39;49;00m:]\r\n", + " \r\n", + " \r\n", + " \u001b[37m## --- Your code here --- ##\u001b[39;49;00m\r\n", + " \r\n", + "\r\n", + " \u001b[37m## TODO: Define a model \u001b[39;49;00m\r\n", + " model = LogisticRegression()\r\n", + " \r\n", + " \r\n", + " \u001b[37m## TODO: Train the 
model\u001b[39;49;00m\r\n",
+      "    model.fit(train_x, train_y)\r\n",
+      "    \r\n",
+      "    \r\n",
+      "    \u001b[37m## --- End of your code --- ##\u001b[39;49;00m\r\n",
+      "    \r\n",
+      "\r\n",
+      "    \u001b[37m# Save the trained model\u001b[39;49;00m\r\n",
+      "    joblib.dump(model, os.path.join(args.model_dir, \u001b[33m\"\u001b[39;49;00m\u001b[33mmodel.joblib\u001b[39;49;00m\u001b[33m\"\u001b[39;49;00m))\r\n"
+     ]
+    }
+   ],
+   "source": [
+    "# directory can be changed to: source_sklearn or source_pytorch\n",
+    "!pygmentize source_sklearn/train.py"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "### Provided code\n",
+    "\n",
+    "If you read the code above, you can see that the starter code includes a few things:\n",
+    "* Model loading (`model_fn`) and saving code\n",
+    "* Getting SageMaker's default hyperparameters\n",
+    "* Loading the training data by name, `train.csv`, and extracting the features and labels, `train_x` and `train_y`\n",
+    "\n",
+    "If you'd like to read more about model saving with [joblib for sklearn](https://scikit-learn.org/stable/modules/model_persistence.html) or with [torch.save](https://pytorch.org/tutorials/beginner/saving_loading_models.html), click on the provided links."
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "---\n",
+    "# Create an Estimator\n",
+    "\n",
+    "When a custom model is constructed in SageMaker, an entry point must be specified. This is the Python file which will be executed when the model is trained; the `train.py` script you specified above. To run a custom training script in SageMaker, construct an estimator, and fill in the appropriate constructor arguments:\n",
+    "\n",
+    "* **entry_point**: The path to the Python script SageMaker runs for training and prediction.\n",
+    "* **source_dir**: The path to the training script directory, `source_sklearn` OR `source_pytorch`.\n",
+    "* **role**: Role ARN, which was specified above.\n",
+    "* **train_instance_count**: The number of training instances (should be left at 1).\n",
+    "* **train_instance_type**: The type of SageMaker instance for training. 
Note: Because Scikit-learn does not natively support GPU training, Sagemaker Scikit-learn does not currently support training on GPU instance types.\n", + "* **sagemaker_session**: The session used to train on Sagemaker.\n", + "* **hyperparameters** (optional): A dictionary `{'name':value, ..}` passed to the train function as hyperparameters.\n", + "\n", + "Note: For a PyTorch model, there is another optional argument **framework_version**, which you can set to the latest version of PyTorch, `1.0`.\n", + "\n", + "## EXERCISE: Define a Scikit-learn or PyTorch estimator\n", + "\n", + "To import your desired estimator, use one of the following lines:\n", + "```\n", + "from sagemaker.sklearn.estimator import SKLearn\n", + "```\n", + "```\n", + "from sagemaker.pytorch import PyTorch\n", + "```" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": {}, + "outputs": [], + "source": [ + "# your import and estimator code, here\n", + "from sagemaker.sklearn.estimator import SKLearn" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## EXERCISE: Train the estimator\n", + "\n", + "Train your estimator on the training data stored in S3. This should create a training job that you can monitor in your SageMaker console." + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "CPU times: user 100 µs, sys: 8 µs, total: 108 µs\n", + "Wall time: 113 µs\n" + ] + } + ], + "source": [ + "%%time\n", + "\n", + "# Train your estimator on S3 training data\n", + "estimator = SKLearn(role=role,\n", + " sagemaker_session=sagemaker_session,\n", + " train_instance_count=1,\n", + " train_instance_type='ml.m4.xlarge',\n", + " entry_point='train.py',\n", + " source_dir='source_sklearn'\n", + " )\n" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "2020-01-07 09:09:49 Starting - Starting the training job...\n", + "2020-01-07 09:09:52 Starting - Launching requested ML instances......\n", + "2020-01-07 09:11:00 Starting - Preparing the instances for training......\n", + "2020-01-07 09:11:59 Downloading - Downloading input data...\n", + "2020-01-07 09:12:45 Training - Training image download completed. Training in progress..\u001b[34m2020-01-07 09:12:46,565 sagemaker-containers INFO Imported framework sagemaker_sklearn_container.training\u001b[0m\n", + "\u001b[34m2020-01-07 09:12:46,567 sagemaker-containers INFO No GPUs detected (normal if no gpus installed)\u001b[0m\n", + "\u001b[34m2020-01-07 09:12:46,579 sagemaker_sklearn_container.training INFO Invoking user training script.\u001b[0m\n", + "\u001b[34m2020-01-07 09:12:46,879 sagemaker-containers INFO Module train does not provide a setup.py. \u001b[0m\n", + "\u001b[34mGenerating setup.py\u001b[0m\n", + "\u001b[34m2020-01-07 09:12:46,879 sagemaker-containers INFO Generating setup.cfg\u001b[0m\n", + "\u001b[34m2020-01-07 09:12:46,879 sagemaker-containers INFO Generating MANIFEST.in\u001b[0m\n", + "\u001b[34m2020-01-07 09:12:46,879 sagemaker-containers INFO Installing module with the following command:\u001b[0m\n", + "\u001b[34m/miniconda3/bin/python -m pip install . 
\u001b[0m\n", + "\u001b[34mProcessing /opt/ml/code\u001b[0m\n", + "\u001b[34mBuilding wheels for collected packages: train\n", + " Building wheel for train (setup.py): started\n", + " Building wheel for train (setup.py): finished with status 'done'\n", + " Created wheel for train: filename=train-1.0.0-py2.py3-none-any.whl size=5830 sha256=89f0f979d7c997c9fa98f05ad318aa496237540267e8f3bbaa7891a289b94c0c\n", + " Stored in directory: /tmp/pip-ephem-wheel-cache-tkej929b/wheels/35/24/16/37574d11bf9bde50616c67372a334f94fa8356bc7164af8ca3\u001b[0m\n", + "\u001b[34mSuccessfully built train\u001b[0m\n", + "\u001b[34mInstalling collected packages: train\u001b[0m\n", + "\u001b[34mSuccessfully installed train-1.0.0\u001b[0m\n", + "\u001b[34m2020-01-07 09:12:48,410 sagemaker-containers INFO No GPUs detected (normal if no gpus installed)\u001b[0m\n", + "\u001b[34m2020-01-07 09:12:48,423 sagemaker-containers INFO Invoking user script\n", + "\u001b[0m\n", + "\u001b[34mTraining Env:\n", + "\u001b[0m\n", + "\u001b[34m{\n", + " \"additional_framework_parameters\": {},\n", + " \"channel_input_dirs\": {\n", + " \"train\": \"/opt/ml/input/data/train\"\n", + " },\n", + " \"current_host\": \"algo-1\",\n", + " \"framework_module\": \"sagemaker_sklearn_container.training:main\",\n", + " \"hosts\": [\n", + " \"algo-1\"\n", + " ],\n", + " \"hyperparameters\": {},\n", + " \"input_config_dir\": \"/opt/ml/input/config\",\n", + " \"input_data_config\": {\n", + " \"train\": {\n", + " \"TrainingInputMode\": \"File\",\n", + " \"S3DistributionType\": \"FullyReplicated\",\n", + " \"RecordWrapperType\": \"None\"\n", + " }\n", + " },\n", + " \"input_dir\": \"/opt/ml/input\",\n", + " \"is_master\": true,\n", + " \"job_name\": \"sagemaker-scikit-learn-2020-01-07-09-09-49-538\",\n", + " \"log_level\": 20,\n", + " \"master_hostname\": \"algo-1\",\n", + " \"model_dir\": \"/opt/ml/model\",\n", + " \"module_dir\": \"s3://sagemaker-us-east-1-309164732448/sagemaker-scikit-learn-2020-01-07-09-09-49-538/source/sourcedir.tar.gz\",\n", + " \"module_name\": \"train\",\n", + " \"network_interface_name\": \"eth0\",\n", + " \"num_cpus\": 4,\n", + " \"num_gpus\": 0,\n", + " \"output_data_dir\": \"/opt/ml/output/data\",\n", + " \"output_dir\": \"/opt/ml/output\",\n", + " \"output_intermediate_dir\": \"/opt/ml/output/intermediate\",\n", + " \"resource_config\": {\n", + " \"current_host\": \"algo-1\",\n", + " \"hosts\": [\n", + " \"algo-1\"\n", + " ],\n", + " \"network_interface_name\": \"eth0\"\n", + " },\n", + " \"user_entry_point\": \"train.py\"\u001b[0m\n", + "\u001b[34m}\n", + "\u001b[0m\n", + "\u001b[34mEnvironment variables:\n", + "\u001b[0m\n", + "\u001b[34mSM_HOSTS=[\"algo-1\"]\u001b[0m\n", + "\u001b[34mSM_NETWORK_INTERFACE_NAME=eth0\u001b[0m\n", + "\u001b[34mSM_HPS={}\u001b[0m\n", + "\u001b[34mSM_USER_ENTRY_POINT=train.py\u001b[0m\n", + "\u001b[34mSM_FRAMEWORK_PARAMS={}\u001b[0m\n", + "\u001b[34mSM_RESOURCE_CONFIG={\"current_host\":\"algo-1\",\"hosts\":[\"algo-1\"],\"network_interface_name\":\"eth0\"}\u001b[0m\n", + "\u001b[34mSM_INPUT_DATA_CONFIG={\"train\":{\"RecordWrapperType\":\"None\",\"S3DistributionType\":\"FullyReplicated\",\"TrainingInputMode\":\"File\"}}\u001b[0m\n", + "\u001b[34mSM_OUTPUT_DATA_DIR=/opt/ml/output/data\u001b[0m\n", + "\u001b[34mSM_CHANNELS=[\"train\"]\u001b[0m\n", + "\u001b[34mSM_CURRENT_HOST=algo-1\u001b[0m\n", + "\u001b[34mSM_MODULE_NAME=train\u001b[0m\n", + "\u001b[34mSM_LOG_LEVEL=20\u001b[0m\n", + "\u001b[34mSM_FRAMEWORK_MODULE=sagemaker_sklearn_container.training:main\u001b[0m\n", + 
"\u001b[34mSM_INPUT_DIR=/opt/ml/input\u001b[0m\n", + "\u001b[34mSM_INPUT_CONFIG_DIR=/opt/ml/input/config\u001b[0m\n", + "\u001b[34mSM_OUTPUT_DIR=/opt/ml/output\u001b[0m\n", + "\u001b[34mSM_NUM_CPUS=4\u001b[0m\n", + "\u001b[34mSM_NUM_GPUS=0\u001b[0m\n", + "\u001b[34mSM_MODEL_DIR=/opt/ml/model\u001b[0m\n", + "\u001b[34mSM_MODULE_DIR=s3://sagemaker-us-east-1-309164732448/sagemaker-scikit-learn-2020-01-07-09-09-49-538/source/sourcedir.tar.gz\u001b[0m\n", + "\u001b[34mSM_TRAINING_ENV={\"additional_framework_parameters\":{},\"channel_input_dirs\":{\"train\":\"/opt/ml/input/data/train\"},\"current_host\":\"algo-1\",\"framework_module\":\"sagemaker_sklearn_container.training:main\",\"hosts\":[\"algo-1\"],\"hyperparameters\":{},\"input_config_dir\":\"/opt/ml/input/config\",\"input_data_config\":{\"train\":{\"RecordWrapperType\":\"None\",\"S3DistributionType\":\"FullyReplicated\",\"TrainingInputMode\":\"File\"}},\"input_dir\":\"/opt/ml/input\",\"is_master\":true,\"job_name\":\"sagemaker-scikit-learn-2020-01-07-09-09-49-538\",\"log_level\":20,\"master_hostname\":\"algo-1\",\"model_dir\":\"/opt/ml/model\",\"module_dir\":\"s3://sagemaker-us-east-1-309164732448/sagemaker-scikit-learn-2020-01-07-09-09-49-538/source/sourcedir.tar.gz\",\"module_name\":\"train\",\"network_interface_name\":\"eth0\",\"num_cpus\":4,\"num_gpus\":0,\"output_data_dir\":\"/opt/ml/output/data\",\"output_dir\":\"/opt/ml/output\",\"output_intermediate_dir\":\"/opt/ml/output/intermediate\",\"resource_config\":{\"current_host\":\"algo-1\",\"hosts\":[\"algo-1\"],\"network_interface_name\":\"eth0\"},\"user_entry_point\":\"train.py\"}\u001b[0m\n", + "\u001b[34mSM_USER_ARGS=[]\u001b[0m\n", + "\u001b[34mSM_OUTPUT_INTERMEDIATE_DIR=/opt/ml/output/intermediate\u001b[0m\n", + "\u001b[34mSM_CHANNEL_TRAIN=/opt/ml/input/data/train\u001b[0m\n", + "\u001b[34mPYTHONPATH=/miniconda3/bin:/miniconda3/lib/python37.zip:/miniconda3/lib/python3.7:/miniconda3/lib/python3.7/lib-dynload:/miniconda3/lib/python3.7/site-packages\n", + "\u001b[0m\n", + "\u001b[34mInvoking script with the following command:\n", + "\u001b[0m\n", + "\u001b[34m/miniconda3/bin/python -m train\n", + "\n", + "\u001b[0m\n", + "\u001b[34m/miniconda3/lib/python3.7/site-packages/sklearn/externals/joblib/externals/cloudpickle/cloudpickle.py:47: DeprecationWarning: the imp module is deprecated in favour of importlib; see the module's documentation for alternative uses\n", + " import imp\u001b[0m\n", + "\u001b[34m/miniconda3/lib/python3.7/site-packages/sklearn/linear_model/logistic.py:432: FutureWarning: Default solver will be changed to 'lbfgs' in 0.22. Specify a solver to silence this warning.\n", + " FutureWarning)\u001b[0m\n", + "\u001b[34m2020-01-07 09:12:49,705 sagemaker-containers INFO Reporting training SUCCESS\u001b[0m\n", + "\n", + "2020-01-07 09:13:10 Uploading - Uploading generated training model\n", + "2020-01-07 09:13:10 Completed - Training job completed\n", + "Training seconds: 71\n", + "Billable seconds: 71\n" + ] + } + ], + "source": [ + "estimator.fit({'train': input_data})" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## EXERCISE: Deploy the trained model\n", + "\n", + "After training, deploy your model to create a `predictor`. If you're using a PyTorch model, you'll need to create a trained `PyTorchModel` that accepts the trained `.model_data` as an input parameter and points to the provided `source_pytorch/predict.py` file as an entry point. 
\n", + "\n", + "To deploy a trained model, you'll use `.deploy`, which takes in two arguments:\n", + "* **initial_instance_count**: The number of deployed instances (1).\n", + "* **instance_type**: The type of SageMaker instance for deployment.\n", + "\n", + "Note: If you run into an instance error, it may be because you chose the wrong training or deployment instance_type. It may help to refer to your previous exercise code to see which types of instances we used." + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "---------------------------------------------------------------------------------------------------!CPU times: user 520 ms, sys: 19.4 ms, total: 540 ms\n", + "Wall time: 8min 20s\n" + ] + } + ], + "source": [ + "%%time\n", + "\n", + "# uncomment, if needed\n", + "# from sagemaker.pytorch import PyTorchModel\n", + "\n", + "\n", + "# deploy your model to create a predictor\n", + "predictor = estimator.deploy(initial_instance_count=1, instance_type='ml.m4.xlarge')" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "---\n", + "# Evaluating Your Model\n", + "\n", + "Once your model is deployed, you can see how it performs when applied to our test data.\n", + "\n", + "The provided cell below, reads in the test data, assuming it is stored locally in `data_dir` and named `test.csv`. The labels and features are extracted from the `.csv` file." + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "metadata": {}, + "outputs": [], + "source": [ + "\"\"\"\n", + "DON'T MODIFY ANYTHING IN THIS CELL THAT IS BELOW THIS LINE\n", + "\"\"\"\n", + "import os\n", + "\n", + "# read in test data, assuming it is stored locally\n", + "test_data = pd.read_csv(os.path.join(data_dir, \"test.csv\"), header=None, names=None)\n", + "\n", + "# labels are in the first column\n", + "test_y = test_data.iloc[:,0]\n", + "test_x = test_data.iloc[:,1:]" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## EXERCISE: Determine the accuracy of your model\n", + "\n", + "Use your deployed `predictor` to generate predicted, class labels for the test data. Compare those to the *true* labels, `test_y`, and calculate the accuracy as a value between 0 and 1.0 that indicates the fraction of test data that your model classified correctly. 
You may use [sklearn.metrics](https://scikit-learn.org/stable/modules/classes.html#module-sklearn.metrics) for this calculation.\n",
+    "\n",
+    "**To pass this project, your model should get at least 90% test accuracy.**"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 11,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Test passed!\n"
+     ]
+    }
+   ],
+   "source": [
+    "# First: generate predicted class labels\n",
+    "test_y_preds = predictor.predict(test_x)\n",
+    "\n",
+    "\n",
+    "\"\"\"\n",
+    "DON'T MODIFY ANYTHING IN THIS CELL THAT IS BELOW THIS LINE\n",
+    "\"\"\"\n",
+    "# test that your model generates the correct number of labels\n",
+    "assert len(test_y_preds)==len(test_y), 'Unexpected number of predictions.'\n",
+    "print('Test passed!')"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 12,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Recall: 1.0\n",
+      "Precision: 0.9375\n",
+      "Accuracy: 0.96\n",
+      "\n",
+      "Predicted class labels: \n",
+      "[1 1 1 1 1 1 0 0 0 0 0 0 1 1 1 1 1 1 0 1 1 1 1 0 0]\n",
+      "\n",
+      "True class labels: \n",
+      "[1 1 1 1 1 1 0 0 0 0 0 0 1 1 1 1 1 1 0 1 0 1 1 0 0]\n"
+     ]
+    }
+   ],
+   "source": [
+    "# Second: calculate the test accuracy, precision, and recall\n",
+    "from sklearn.metrics import accuracy_score, precision_score, recall_score\n",
+    "\n",
+    "accuracy = accuracy_score(test_y, test_y_preds)\n",
+    "precision = precision_score(test_y, test_y_preds)\n",
+    "recall = recall_score(test_y, test_y_preds)\n",
+    "\n",
+    "print('Recall:', recall)\n",
+    "print('Precision:', precision)\n",
+    "print('Accuracy:', accuracy)\n",
+    "\n",
+    "\n",
+    "## print out the array of predicted and true labels, if you want\n",
+    "print('\\nPredicted class labels: ')\n",
+    "print(test_y_preds)\n",
+    "print('\\nTrue class labels: ')\n",
+    "print(test_y.values)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "### Question 1: How many false positives and false negatives did your model produce, if any? And why do you think this is?"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "**Answer**: The model produced 1 false positive and no false negatives: one non-plagiarized answer (the 21st test point above) was labeled as plagiarized, which is the single error behind the 96% test accuracy (24 of 25 test points correct). Since the test set is small and the similarity features separate the classes well, a single answer that sits close to the decision boundary accounts for the only misclassification.\n"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "### Question 2: How did you decide on the type of model to use? "
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "**Answer**: Since this is a binary classification problem with a small set of numerical similarity features, logistic regression is a natural choice: it takes the n feature values as input, trains quickly on a small dataset, and outputs a class label (along with a probability) for each answer."
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "----\n",
+    "## EXERCISE: Clean up Resources\n",
+    "\n",
+    "After you're done evaluating your model, **delete your model endpoint**. You can do this with a call to `.delete_endpoint()`. You need to show, in this notebook, that the endpoint was deleted. Any other resources you may delete from the AWS console, and you will find more instructions on cleaning up all your resources below.\n",
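+    "\n",
+    "If you want to double-check that nothing is left running after the deletion call, one optional way (a sketch, assuming `boto3` is available in this notebook environment) is to list the endpoints in your account:\n",
+    "\n",
+    "```python\n",
+    "import boto3\n",
+    "\n",
+    "# list any SageMaker endpoints still active in this account/region\n",
+    "# (the list should be empty once the predictor's endpoint is deleted)\n",
+    "sm_client = boto3.client('sagemaker')\n",
+    "for ep in sm_client.list_endpoints()['Endpoints']:\n",
+    "    print(ep['EndpointName'], ep['EndpointStatus'])\n",
+    "```"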
+ ] + }, + { + "cell_type": "code", + "execution_count": 13, + "metadata": {}, + "outputs": [], + "source": [ + "# uncomment and fill in the line below!\n", + "predictor.delete_endpoint()\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Deleting S3 bucket\n", + "\n", + "When you are *completely* done with training and testing models, you can also delete your entire S3 bucket. If you do this before you are done training your model, you'll have to recreate your S3 bucket and upload your training data again." + ] + }, + { + "cell_type": "code", + "execution_count": 14, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "[{'ResponseMetadata': {'RequestId': 'FF46362CF1FCE014',\n", + " 'HostId': 'CpqWvNa0rhrelGsFOAYQFukvUQ8xvAhiB9EB24l472MTvPhr1ykHsAX/Gr8a+lSIjlL1Jfoz9fg=',\n", + " 'HTTPStatusCode': 200,\n", + " 'HTTPHeaders': {'x-amz-id-2': 'CpqWvNa0rhrelGsFOAYQFukvUQ8xvAhiB9EB24l472MTvPhr1ykHsAX/Gr8a+lSIjlL1Jfoz9fg=',\n", + " 'x-amz-request-id': 'FF46362CF1FCE014',\n", + " 'date': 'Tue, 07 Jan 2020 09:21:53 GMT',\n", + " 'connection': 'close',\n", + " 'content-type': 'application/xml',\n", + " 'transfer-encoding': 'chunked',\n", + " 'server': 'AmazonS3'},\n", + " 'RetryAttempts': 0},\n", + " 'Deleted': [{'Key': 'sagemaker-scikit-learn-2020-01-07-09-01-41-326/output/model.tar.gz'},\n", + " {'Key': 'sagemaker/plagiarism_detector/test.csv'},\n", + " {'Key': 'sagemaker/plagiarism_detector/train.csv'},\n", + " {'Key': 'sagemaker-scikit-learn-2020-01-07-09-09-49-538/debug-output/training_job_end.ts'},\n", + " {'Key': 'sagemaker-scikit-learn-2020-01-07-09-09-49-538/output/model.tar.gz'},\n", + " {'Key': 'sagemaker-scikit-learn-2020-01-07-09-09-49-538/source/sourcedir.tar.gz'},\n", + " {'Key': 'sagemaker-scikit-learn-2020-01-07-09-01-41-326/source/sourcedir.tar.gz'},\n", + " {'Key': 'sagemaker-scikit-learn-2020-01-07-09-01-41-326/debug-output/training_job_end.ts'}]}]" + ] + }, + "execution_count": 14, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "#deleting bucket, uncomment lines below\n", + "\n", + "bucket_to_delete = boto3.resource('s3').Bucket(bucket)\n", + "bucket_to_delete.objects.all().delete()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Deleting all your models and instances\n", + "\n", + "When you are _completely_ done with this project and do **not** ever want to revisit this notebook, you can choose to delete all of your SageMaker notebook instances and models by following [these instructions](https://docs.aws.amazon.com/sagemaker/latest/dg/ex1-cleanup.html). Before you delete this notebook instance, I recommend at least downloading a copy and saving it, locally." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "---\n", + "## Further Directions\n", + "\n", + "There are many ways to improve or add on to this project to expand your learning or make this more of a unique project for you. 
A few ideas are listed below:\n", + "* Train a classifier to predict the *category* (1-3) of plagiarism and not just plagiarized (1) or not (0).\n", + "* Utilize a different and larger dataset to see if this model can be extended to other types of plagiarism.\n", + "* Use language or character-level analysis to find different (and more) similarity features.\n", + "* Write a complete pipeline function that accepts a source text and submitted text file, and classifies the submitted text as plagiarized or not.\n", + "* Use API Gateway and a lambda function to deploy your model to a web application.\n", + "\n", + "These are all just options for extending your work. If you've completed all the exercises in this notebook, you've completed a real-world application, and can proceed to submit your project. Great job!" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "conda_pytorch_p36", + "language": "python", + "name": "conda_pytorch_p36" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.6.5" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/1_Data_Exploration.ipynb b/1_Data_Exploration.ipynb new file mode 100644 index 0000000..154d7e1 --- /dev/null +++ b/1_Data_Exploration.ipynb @@ -0,0 +1,746 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Plagiarism Text Data\n", + "\n", + "In this project, you will be tasked with building a plagiarism detector that examines a text file and performs binary classification; labeling that file as either plagiarized or not, depending on how similar the text file is when compared to a provided source text. \n", + "\n", + "The first step in working with any dataset is loading the data in and noting what information is included in the dataset. This is an important step in eventually working with this data, and knowing what kinds of features you have to work with as you transform and group the data!\n", + "\n", + "So, this notebook is all about exploring the data and noting patterns about the features you are given and the distribution of data. \n", + "\n", + "> There are not any exercises or questions in this notebook, it is only meant for exploration. This notebook will note be required in your final project submission.\n", + "\n", + "---" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Read in the Data\n", + "\n", + "The cell below will download the necessary data and extract the files into the folder `data/`.\n", + "\n", + "This data is a slightly modified version of a dataset created by Paul Clough (Information Studies) and Mark Stevenson (Computer Science), at the University of Sheffield. You can read all about the data collection and corpus, at [their university webpage](https://ir.shef.ac.uk/cloughie/resources/plagiarism_corpus.html). \n", + "\n", + "> **Citation for data**: Clough, P. and Stevenson, M. Developing A Corpus of Plagiarised Short Answers, Language Resources and Evaluation: Special Issue on Plagiarism and Authorship Analysis, In Press. 
[Download]" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "!wget https://s3.amazonaws.com/video.udacity-data.com/topher/2019/January/5c4147f9_data/data.zip\n", + "!unzip data" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [], + "source": [ + "# import libraries\n", + "import pandas as pd\n", + "import numpy as np\n", + "import os" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "This plagiarism dataset is made of multiple text files; each of these files has characteristics that are is summarized in a `.csv` file named `file_information.csv`, which we can read in using `pandas`." + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
FileTaskCategory
0g0pA_taska.txtanon
1g0pA_taskb.txtbcut
2g0pA_taskc.txtclight
3g0pA_taskd.txtdheavy
4g0pA_taske.txtenon
5g0pB_taska.txtanon
6g0pB_taskb.txtbnon
7g0pB_taskc.txtccut
8g0pB_taskd.txtdlight
9g0pB_taske.txteheavy
\n", + "
" + ], + "text/plain": [ + " File Task Category\n", + "0 g0pA_taska.txt a non\n", + "1 g0pA_taskb.txt b cut\n", + "2 g0pA_taskc.txt c light\n", + "3 g0pA_taskd.txt d heavy\n", + "4 g0pA_taske.txt e non\n", + "5 g0pB_taska.txt a non\n", + "6 g0pB_taskb.txt b non\n", + "7 g0pB_taskc.txt c cut\n", + "8 g0pB_taskd.txt d light\n", + "9 g0pB_taske.txt e heavy" + ] + }, + "execution_count": 2, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "csv_file = 'data/file_information.csv'\n", + "plagiarism_df = pd.read_csv(csv_file)\n", + "\n", + "# print out the first few rows of data info\n", + "plagiarism_df.head(10)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Types of Plagiarism\n", + "\n", + "Each text file is associated with one **Task** (task A-E) and one **Category** of plagiarism, which you can see in the above DataFrame.\n", + "\n", + "### Five task types, A-E\n", + "\n", + "Each text file contains an answer to one short question; these questions are labeled as tasks A-E.\n", + "* Each task, A-E, is about a topic that might be included in the Computer Science curriculum that was created by the authors of this dataset. \n", + " * For example, Task A asks the question: \"What is inheritance in object oriented programming?\"\n", + "\n", + "### Four categories of plagiarism \n", + "\n", + "Each text file has an associated plagiarism label/category:\n", + "\n", + "1. `cut`: An answer is plagiarized; it is copy-pasted directly from the relevant Wikipedia source text.\n", + "2. `light`: An answer is plagiarized; it is based on the Wikipedia source text and includes some copying and paraphrasing.\n", + "3. `heavy`: An answer is plagiarized; it is based on the Wikipedia source text but expressed using different words and structure. Since this doesn't copy directly from a source text, this will likely be the most challenging kind of plagiarism to detect.\n", + "4. `non`: An answer is not plagiarized; the Wikipedia source text is not used to create this answer.\n", + "5. `orig`: This is a specific category for the original, Wikipedia source text. We will use these files only for comparison purposes.\n", + "\n", + "> So, out of the submitted files, the only category that does not contain any plagiarism is `non`.\n", + "\n", + "In the next cell, print out some statistics about the data." + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Number of files: 100\n", + "Number of unique tasks/question types (A-E): 5\n", + "Unique plagiarism categories: ['non' 'cut' 'light' 'heavy' 'orig']\n" + ] + } + ], + "source": [ + "# print out some stats about the data\n", + "print('Number of files: ', plagiarism_df.shape[0]) # .shape[0] gives the rows \n", + "# .unique() gives unique items in a specified column\n", + "print('Number of unique tasks/question types (A-E): ', (len(plagiarism_df['Task'].unique())))\n", + "print('Unique plagiarism categories: ', (plagiarism_df['Category'].unique()))" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "You should see the number of text files in the dataset as well as some characteristics about the `Task` and `Category` columns. 
**Note that the file count of 100 *includes* the 5 _original_ wikipedia files for tasks A-E.** If you take a look at the files in the `data` directory, you'll notice that the original, source texts start with the filename `orig_` as opposed to `g` for \"group.\" \n", + "\n", + "> So, in total there are 100 files, 95 of which are answers (submitted by people) and 5 of which are the original, Wikipedia source texts.\n", + "\n", + "Your end goal will be to use this information to classify any given answer text into one of two categories, plagiarized or not-plagiarized." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Distribution of Data\n", + "\n", + "Next, let's look at the distribution of data. In this course, we've talked about traits like class imbalance that can inform how you develop an algorithm. So, here, we'll ask: **How evenly is our data distributed among different tasks and plagiarism levels?**\n", + "\n", + "Below, you should notice two things:\n", + "* Our dataset is quite small, especially with respect to examples of varying plagiarism levels.\n", + "* The data is distributed fairly evenly across task and plagiarism types." + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + "Task:\n" + ] + }, + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
TaskCounts
0a20
1b20
2c20
3d20
4e20
\n", + "
" + ], + "text/plain": [ + " Task Counts\n", + "0 a 20\n", + "1 b 20\n", + "2 c 20\n", + "3 d 20\n", + "4 e 20" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + "Plagiarism Levels:\n" + ] + }, + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
CategoryCounts
0cut19
1heavy19
2light19
3non38
4orig5
\n", + "
" + ], + "text/plain": [ + " Category Counts\n", + "0 cut 19\n", + "1 heavy 19\n", + "2 light 19\n", + "3 non 38\n", + "4 orig 5" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + "Task & Plagiarism Level Combos :\n" + ] + }, + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
TaskCategoryCounts
0acut4
1aheavy3
2alight3
3anon9
4aorig1
5bcut3
6bheavy4
7blight3
8bnon9
9borig1
10ccut3
11cheavy5
12clight4
13cnon7
14corig1
15dcut4
16dheavy4
17dlight5
18dnon6
19dorig1
20ecut5
21eheavy3
22elight4
23enon7
24eorig1
\n", + "
" + ], + "text/plain": [ + " Task Category Counts\n", + "0 a cut 4\n", + "1 a heavy 3\n", + "2 a light 3\n", + "3 a non 9\n", + "4 a orig 1\n", + "5 b cut 3\n", + "6 b heavy 4\n", + "7 b light 3\n", + "8 b non 9\n", + "9 b orig 1\n", + "10 c cut 3\n", + "11 c heavy 5\n", + "12 c light 4\n", + "13 c non 7\n", + "14 c orig 1\n", + "15 d cut 4\n", + "16 d heavy 4\n", + "17 d light 5\n", + "18 d non 6\n", + "19 d orig 1\n", + "20 e cut 5\n", + "21 e heavy 3\n", + "22 e light 4\n", + "23 e non 7\n", + "24 e orig 1" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "# Show counts by different tasks and amounts of plagiarism\n", + "\n", + "# group and count by task\n", + "counts_per_task=plagiarism_df.groupby(['Task']).size().reset_index(name=\"Counts\")\n", + "print(\"\\nTask:\")\n", + "display(counts_per_task)\n", + "\n", + "# group by plagiarism level\n", + "counts_per_category=plagiarism_df.groupby(['Category']).size().reset_index(name=\"Counts\")\n", + "print(\"\\nPlagiarism Levels:\")\n", + "display(counts_per_category)\n", + "\n", + "# group by task AND plagiarism level\n", + "counts_task_and_plagiarism=plagiarism_df.groupby(['Task', 'Category']).size().reset_index(name=\"Counts\")\n", + "print(\"\\nTask & Plagiarism Level Combos :\")\n", + "display(counts_task_and_plagiarism)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "It may also be helpful to look at this last DataFrame, graphically.\n", + "\n", + "Below, you can see that the counts follow a pattern broken down by task. Each task has one source text (original) and the highest number on `non` plagiarized cases." + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "" + ] + }, + "execution_count": 5, + "metadata": {}, + "output_type": "execute_result" + }, + { + "data": { + "image/png": 
"iVBORw0KGgoAAAANSUhEUgAAAd0AAAEyCAYAAAC/Lwo5AAAABHNCSVQICAgIfAhkiAAAAAlwSFlzAAALEgAACxIB0t1+/AAAADl0RVh0U29mdHdhcmUAbWF0cGxvdGxpYiB2ZXJzaW9uIDMuMC4zLCBodHRwOi8vbWF0cGxvdGxpYi5vcmcvnQurowAADCFJREFUeJzt3V+MpXddx/HP1w5EW4jWdEKwf1w0xoRwIWRiVAhpQI2iEU0MgQQD3qwXosWYKHoDNybGIMELQ7ICBmOFmFKVGKKQCFFvGnZLI21XlGD5UwtdQiLUm4r9ejGHuK67O2fa53xnz9nXK9nszJnnnPnOb57Je5/nnHm2ujsAwOZ9y0kPAADXC9EFgCGiCwBDRBcAhoguAAwRXQAYIroAMER0AWCI6ALAkL1NPOgtt9zSp06d2sRDA8A159y5c1/p7v2jtttIdE+dOpWzZ89u4qEB4JpTVZ9bZzunlwFgiOgCwBDRBYAhogsAQ0QXAIaILgAMEV0AGCK6ADBEdAFgiOgCwBDRBYAhG7n2Mv9f1fHv0738HNvOOgLbzJEuAAwRXQAYIroAMER0AWCI6ALAENEFgCGiCwBDRBcAhoguAAwRXQAYIroAMER0AWCI6ALAENEFgCGiCwBDRBcAhoguAAwRXQAYIroAMER0AWCI6ALAENEFgCGiCwBDRBcAhqwV3ar6tap6qKoerKr3V9W3bnowANg1R0a3qm5N8qtJDrr7RUluSPLaTQ8GALtm3dPLe0m+rar2ktyY5N83NxIA7KYjo9vdjyZ5e5LPJ3ksyX9090c2PRgA7Jp1Ti/fnOTVSV6Q5LuS3FRVr7/Mdqer6mxVnb1w4cLykwLAllvn9PKPJvm37r7Q3f+V5N4kP3LpRt19prsPuvtgf39/6TkBYOutE93PJ/mhqrqxqirJK5Oc3+xYALB71nlO974k9yS5P8mnVvc5s+G5AGDn7K2zUXe/NclbNzwLAOw0V6QCgCGiCwBDRBcAhoguAAwRXQAYIroAMER0AWCI6ALAENEFgCGiCwBDRBcAhoguAAwRXQAYIroAMER0AWCI6ALAENEFgCGiCwBDRBcAhoguAAwRXQAYIroAMER0AWDI3kkPAMyrOv59upefA663fdGRLgAMEV0AGCK6ADBEdAFgiOgCwBDRBYAhogsAQ0QXAIaILgAMEV0AGCK6ADBEdAFgiOgCwBDRBYAhogsAQ0QXAIaILgAMEV0AGCK6ADBEdAFgiOgCwBDRBYAhogsAQ0QXAIaILgAMWSu6VfUdVXVPVf1zVZ2vqh/e9GAAsGv21tzuD5L8TXf/fFU9O8mNG5wJAHbSkdGtqm9P8vIkb0yS7n4yyZObHQsAds86p5dfkORCkj+uqk9W1bur6qZLN6qq01V1tqrOXrhwYfFBAa4lVU/vD9e3daK7l+QlSd7V3S9O8p9J3nLpRt19prsPuvtgf39/4TEBYPutE90vJvlid9+3ev+eHEYYADiGI6Pb3V9K8oWq+v7VTa9M8vBGpwKAHbTuq5d/Jcndq1cufzbJL25uJADYTWtFt7sfSHKw4VkAYKe5IhUADBFdABgiugAwRHQBYIjoAsAQ0QWAIaILAENEFwCGiC4ADBFdABgiugAwRHQBYIjoAsAQ0QWAIaILAENEFwCGiC4ADBFdABgiugAwRHQBYIjoAsAQ0QWAIaILAEP2TnoAuN5UHf8+3cvPcb3bhe/DLnwN1xtHugAwRHQBYIjoAsAQ0QWAIaILAENEFwCGiC4ADBFdABgiugAwRHQBYIjoAsAQ0QWAIaILAENEFwCGiC4ADBFdABgiugAwRHQBYIjoAsAQ0QWAIaILAENEFwCGiC4ADBFdABiydnSr6oaq+mRV/fUmBwKAXXWcI927kpzf1CAAsOvWim5V3Zbkp5K8e7PjAMDuWvdI951JfiPJUxucBQB22t5RG1TVTyd5vLvPVdWdV9nudJLTSXLHHXcsNuDhYx//Pt2LjkB8H64Vu/B9WOJr2IV12AW+D8ezzpHuS5P8TFU9kuQDSV5RVX966Ubdfaa7D7r7YH9/f+ExAWD7HRnd7v6t7r6tu08leW2Sv+vu1298MgDYMX5PFwCGHPmc7sW6++NJPr6RSQBgxznSBYAhogsAQ0QXAIaILgAMEV0AGCK6ADBEdAFgiOgCwBDRBYAhogsAQ0QXAIaILgAMEV0AGCK6ADBEdAFgiOgCwBDRBYAhogsAQ0QXAIaILgAMEV0AGCK6ADBEdAFgiOgCwJC9kx5gQtXx79O9/GOctJP+Gp7O5196hiWc9DrCkuzPsxzpAsAQ0QWAIaILAENEFwCGiC4ADBFdABgiugAwRHQBYIjoAsAQ0QWAIaILAENEFwCGiC4ADBFdABgiugAwRHQBYIjoAsAQ0QWAIaILAENEFwCGiC4ADBFdABgiugAwRHQBYMiR0a2q26vqY1X1cFU9VFV3TQwGALtmb41tvpHk17v7/qp6bpJzVfXR7n54w7MBwE458ki3ux/r7vtXb389yfkkt256MADYNcd6TreqTiV5cZL7LvOx01V1tqrOXrhwYZnpAGCHrB3dqnpOkg8meXN3f+3Sj3f3me4+6O6D/f39JWcEgJ2wVnSr6lk5DO7d3X3vZkcCgN20zquXK8l7kpzv7ndsfiQA2E3rHOm+NMkvJHlFVT2w+vOqDc8FADvnyF8Z6u5/TFIDswDATnNFKgAYIroAMER0AWCI6ALAENEFgCGiCwBDRBcAhoguAAwRXQAYIroAMER0AWCI6ALAENEFgCGiCwBDRBcAhoguAAwRXQAYIroAMER0AWCI6ALAENEFgCGiCwBDRBcAhuyd9ACsp+r49+lefo5tZx2XYR2XYR2XsU3r6EgXAIaILgAMEV0AGCK6ADBEdAFgiOgCwBDRBYAhogsAQ0QXAIaILgAMEV0AGCK6ADBEdAFgiOgCwBDRBYAhogsAQ0QXAIaILgAMEV0AGCK6ADBEdAFgiOgCwBDRBYAhogsAQ9aKblX9RFV9uqo+U1Vv2fRQALCLjoxuVd2Q5A+T/GSSFyZ5XVW9cNODAcCuWedI9weTfKa7P9vdTyb5QJJXb3YsANg960T31iRfuOj9L65uAwCOYW+pB6qq00lOr959oqo+vdRjX8UtSb5y+Xme2QM/0/tfCzMc4/7WcZn7X7PruGXfh8uu45Z9DdfCDDu7jsNfwxV/ri/x3es82DrRfTTJ7Re9f9vqtv+ju88kObPOJ11KVZ3t7oPJz7mLrOMyrOMyrOMyrOMyll7HdU4vfyLJ91XVC6rq2Ulem+RDSw0AANeLI490u/sbVfWmJH+b5IYk7+3uhzY+GQDsmLWe0+3uDyf58IZneTpGT2fvMOu4DOu4DOu4DOu4jEXXsbp7yccDAK7AZSABYIjoAsCQrY2u60Evo6oeqapPVdUDVXX2pOfZFlX13qp6vKoevOi276yqj1bVv67+vvkkZ9wGV1jHt1XVo6t98oGqetVJznitq6rbq+pjVfVwVT1UVXetbrc/HsNV1nHR/XErn9NdXQ/6X5
L8WA6vkPWJJK/r7odPdLAtVFWPJDno7nV++ZuVqnp5kieS/El3v2h12+8l+Wp3/+7qH4I3d/dvnuSc17orrOPbkjzR3W8/ydm2RVU9P8nzu/v+qnpuknNJfjbJG2N/XNtV1vE1WXB/3NYjXdeD5kR1998n+eolN786yftWb78vhz+wXMUV1pFj6O7Huvv+1dtfT3I+h5fqtT8ew1XWcVHbGl3Xg15OJ/lIVZ1bXcqTp+953f3Y6u0vJXneSQ6z5d5UVf+0Ov3stOiaqupUkhcnuS/2x6ftknVMFtwftzW6LOdl3f2SHP7Xjb+8Ot3HM9SHz9ts33M314Z3JfneJD+Q5LEkv3+y42yHqnpOkg8meXN3f+3ij9kf13eZdVx0f9zW6K51PWiO1t2Prv5+PMlf5PDUPU/Pl1fPC33z+aHHT3ierdTdX+7u/+7up5L8UeyTR6qqZ+UwFHd3972rm+2Px3S5dVx6f9zW6Loe9AKq6qbVCwZSVTcl+fEkD179XlzFh5K8YfX2G5L81QnOsrW+GYqVn4t98qqqqpK8J8n57n7HRR+yPx7DldZx6f1xK1+9nCSrl22/M/97PejfOeGRtk5VfU8Oj26Tw0uC/pl1XE9VvT/JnTn8b7++nOStSf4yyZ8nuSPJ55K8pru9SOgqrrCOd+bwVF4neSTJL1303CSXqKqXJfmHJJ9K8tTq5t/O4fOR9sc1XWUdX5cF98etjS4AbJttPb0MAFtHdAFgiOgCwBDRBYAhogsAQ0QXAIaILgAM+R8ehKbWpEhdRgAAAABJRU5ErkJggg==\n", + "text/plain": [ + "
" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "import matplotlib.pyplot as plt\n", + "% matplotlib inline\n", + "\n", + "# counts\n", + "group = ['Task', 'Category']\n", + "counts = plagiarism_df.groupby(group).size().reset_index(name=\"Counts\")\n", + "\n", + "plt.figure(figsize=(8,5))\n", + "plt.bar(range(len(counts)), counts['Counts'], color = 'blue')" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Up Next\n", + "\n", + "This notebook is just about data loading and exploration, and you do not need to include it in your final project submission. \n", + "\n", + "In the next few notebooks, you'll use this data to train a complete plagiarism classifier. You'll be tasked with extracting meaningful features from the text data, reading in answers to different tasks and comparing them to the original Wikipedia source text. You'll engineer similarity features that will help identify cases of plagiarism. Then, you'll use these features to train and deploy a classification model in a SageMaker notebook instance. " + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "conda_amazonei_mxnet_p36", + "language": "python", + "name": "conda_amazonei_mxnet_p36" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.6.5" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/2_Plagiarism_Feature_Engineering.ipynb b/2_Plagiarism_Feature_Engineering.ipynb new file mode 100644 index 0000000..9a32e37 --- /dev/null +++ b/2_Plagiarism_Feature_Engineering.ipynb @@ -0,0 +1,2362 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Plagiarism Detection, Feature Engineering\n", + "\n", + "In this project, you will be tasked with building a plagiarism detector that examines an answer text file and performs binary classification; labeling that file as either plagiarized or not, depending on how similar that text file is to a provided, source text. \n", + "\n", + "Your first task will be to create some features that can then be used to train a classification model. 
This task will be broken down into a few discrete steps:\n", + "\n", + "* Clean and pre-process the data.\n", + "* Define features for comparing the similarity of an answer text and a source text, and extract similarity features.\n", + "* Select \"good\" features, by analyzing the correlations between different features.\n", + "* Create train/test `.csv` files that hold the relevant features and class labels for train/test data points.\n", + "\n", + "In the _next_ notebook, Notebook 3, you'll use the features and `.csv` files you create in _this_ notebook to train a binary classification model in a SageMaker notebook instance.\n", + "\n", + "You'll be defining a few different similarity features, as outlined in [this paper](https://s3.amazonaws.com/video.udacity-data.com/topher/2019/January/5c412841_developing-a-corpus-of-plagiarised-short-answers/developing-a-corpus-of-plagiarised-short-answers.pdf), which should help you build a robust plagiarism detector!\n", + "\n", + "To complete this notebook, you'll have to complete all given exercises and answer all the questions in this notebook.\n", + "> All your tasks will be clearly labeled **EXERCISE** and questions as **QUESTION**.\n", + "\n", + "It will be up to you to decide on the features to include in your final training and test data.\n", + "\n", + "---" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Read in the Data\n", + "\n", + "The cell below will download the necessary, project data and extract the files into the folder `data/`.\n", + "\n", + "This data is a slightly modified version of a dataset created by Paul Clough (Information Studies) and Mark Stevenson (Computer Science), at the University of Sheffield. You can read all about the data collection and corpus, at [their university webpage](https://ir.shef.ac.uk/cloughie/resources/plagiarism_corpus.html). \n", + "\n", + "> **Citation for data**: Clough, P. and Stevenson, M. Developing A Corpus of Plagiarised Short Answers, Language Resources and Evaluation: Special Issue on Plagiarism and Authorship Analysis, In Press. [Download]" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [], + "source": [ + "# NOTE:\n", + "# you only need to run this cell if you have not yet downloaded the data\n", + "# otherwise you may skip this cell or comment it out\n", + "\n", + "#!wget https://s3.amazonaws.com/video.udacity-data.com/topher/2019/January/5c4147f9_data/data.zip\n", + "#!unzip data" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [], + "source": [ + "# import libraries\n", + "import pandas as pd\n", + "import numpy as np\n", + "import os" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "This plagiarism dataset is made of multiple text files; each of these files has characteristics that are is summarized in a `.csv` file named `file_information.csv`, which we can read in using `pandas`." + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
FileTaskCategory
0g0pA_taska.txtanon
1g0pA_taskb.txtbcut
2g0pA_taskc.txtclight
3g0pA_taskd.txtdheavy
4g0pA_taske.txtenon
\n", + "
" + ], + "text/plain": [ + " File Task Category\n", + "0 g0pA_taska.txt a non\n", + "1 g0pA_taskb.txt b cut\n", + "2 g0pA_taskc.txt c light\n", + "3 g0pA_taskd.txt d heavy\n", + "4 g0pA_taske.txt e non" + ] + }, + "execution_count": 3, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "csv_file = 'data/file_information.csv'\n", + "plagiarism_df = pd.read_csv(csv_file)\n", + "\n", + "# print out the first few rows of data info\n", + "plagiarism_df.head()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Types of Plagiarism\n", + "\n", + "Each text file is associated with one **Task** (task A-E) and one **Category** of plagiarism, which you can see in the above DataFrame.\n", + "\n", + "### Tasks, A-E\n", + "\n", + "Each text file contains an answer to one short question; these questions are labeled as tasks A-E. For example, Task A asks the question: \"What is inheritance in object oriented programming?\"\n", + "\n", + "### Categories of plagiarism \n", + "\n", + "Each text file has an associated plagiarism label/category:\n", + "\n", + "**1. Plagiarized categories: `cut`, `light`, and `heavy`.**\n", + "* These categories represent different levels of plagiarized answer texts. `cut` answers copy directly from a source text, `light` answers are based on the source text but include some light rephrasing, and `heavy` answers are based on the source text, but *heavily* rephrased (and will likely be the most challenging kind of plagiarism to detect).\n", + " \n", + "**2. Non-plagiarized category: `non`.** \n", + "* `non` indicates that an answer is not plagiarized; the Wikipedia source text is not used to create this answer.\n", + " \n", + "**3. Special, source text category: `orig`.**\n", + "* This is a specific category for the original, Wikipedia source text. We will use these files only for comparison purposes." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "---\n", + "## Pre-Process the Data\n", + "\n", + "In the next few cells, you'll be tasked with creating a new DataFrame of desired information about all of the files in the `data/` directory. This will prepare the data for feature extraction and for training a binary, plagiarism classifier." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### EXERCISE: Convert categorical to numerical data\n", + "\n", + "You'll notice that the `Category` column in the data, contains string or categorical values, and to prepare these for feature extraction, we'll want to convert these into numerical values. Additionally, our goal is to create a binary classifier and so we'll need a binary class label that indicates whether an answer text is plagiarized (1) or not (0). Complete the below function `numerical_dataframe` that reads in a `file_information.csv` file by name, and returns a *new* DataFrame with a numerical `Category` column and a new `Class` column that labels each answer as plagiarized or not. \n", + "\n", + "Your function should return a new DataFrame with the following properties:\n", + "\n", + "* 4 columns: `File`, `Task`, `Category`, `Class`. 
The `File` and `Task` columns can remain unchanged from the original `.csv` file.\n", + "* Convert all `Category` labels to numerical labels according to the following rules (a higher value indicates a higher degree of plagiarism):\n", + " * 0 = `non`\n", + " * 1 = `heavy`\n", + " * 2 = `light`\n", + " * 3 = `cut`\n", + " * -1 = `orig`, this is a special value that indicates an original file.\n", + "* For the new `Class` column\n", + " * Any answer text that is not plagiarized (`non`) should have the class label `0`. \n", + " * Any plagiarized answer texts should have the class label `1`. \n", + " * And any `orig` texts will have a special label `-1`. \n", + "\n", + "### Expected output\n", + "\n", + "After running your function, you should get a DataFrame with rows that looks like the following: \n", + "```\n", + "\n", + " File\t Task Category Class\n", + "0\tg0pA_taska.txt\ta\t 0 \t0\n", + "1\tg0pA_taskb.txt\tb\t 3 \t1\n", + "2\tg0pA_taskc.txt\tc\t 2 \t1\n", + "3\tg0pA_taskd.txt\td\t 1 \t1\n", + "4\tg0pA_taske.txt\te\t 0\t 0\n", + "...\n", + "...\n", + "99 orig_taske.txt e -1 -1\n", + "\n", + "```" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [], + "source": [ + "# Read in a csv file and return a transformed dataframe\n", + "def numerical_dataframe(csv_file='data/file_information.csv'):\n", + " '''Reads in a csv file which is assumed to have `File`, `Category` and `Task` columns.\n", + " This function does two things: \n", + " 1) converts `Category` column values to numerical values \n", + " 2) Adds a new, numerical `Class` label column.\n", + " The `Class` column will label plagiarized answers as 1 and non-plagiarized as 0.\n", + " Source texts have a special label, -1.\n", + " :param csv_file: The directory for the file_information.csv file\n", + " :return: A dataframe with numerical categories and a new `Class` label column'''\n", + " \n", + " # your code here\n", + " category_to_numerical = {'non': 0, 'heavy': 1, 'light': 2, 'cut': 3, 'orig': -1 }\n", + " numerical_to_class = {'non': 0, 'heavy': 1, 'light': 1, 'cut': 1, 'orig': -1}\n", + " df = pd.read_csv(csv_file)\n", + " class_list = []\n", + " category_list = []\n", + " for index, row in df.iterrows():\n", + " category_list.append(category_to_numerical[row['Category']])\n", + " class_list.append(numerical_to_class[row['Category']])\n", + " df['Category'] = category_list\n", + " df['Class'] = class_list\n", + " return df\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Test cells\n", + "\n", + "Below are a couple of test cells. The first is an informal test where you can check that your code is working as expected by calling your function and printing out the returned result.\n", + "\n", + "The **second** cell below is a more rigorous test cell. The goal of a cell like this is to ensure that your code is working as expected, and to form any variables that might be used in _later_ tests/code, in this case, the data frame, `transformed_df`.\n", + "\n", + "> The cells in this notebook should be run in chronological order (the order they appear in the notebook). This is especially important for test cells.\n", + "\n", + "Often, later cells rely on the functions, imports, or variables defined in earlier cells. For example, some tests rely on previous tests to work.\n", + "\n", + "These tests do not test all cases, but they are a great way to check that you are on the right track!" 
+ ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
FileTaskCategoryClass
0g0pA_taska.txta00
1g0pA_taskb.txtb31
2g0pA_taskc.txtc21
3g0pA_taskd.txtd11
4g0pA_taske.txte00
5g0pB_taska.txta00
6g0pB_taskb.txtb00
7g0pB_taskc.txtc31
8g0pB_taskd.txtd21
9g0pB_taske.txte11
\n", + "
" + ], + "text/plain": [ + " File Task Category Class\n", + "0 g0pA_taska.txt a 0 0\n", + "1 g0pA_taskb.txt b 3 1\n", + "2 g0pA_taskc.txt c 2 1\n", + "3 g0pA_taskd.txt d 1 1\n", + "4 g0pA_taske.txt e 0 0\n", + "5 g0pB_taska.txt a 0 0\n", + "6 g0pB_taskb.txt b 0 0\n", + "7 g0pB_taskc.txt c 3 1\n", + "8 g0pB_taskd.txt d 2 1\n", + "9 g0pB_taske.txt e 1 1" + ] + }, + "execution_count": 5, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# informal testing, print out the results of a called function\n", + "# create new `transformed_df`\n", + "transformed_df = numerical_dataframe(csv_file ='data/file_information.csv')\n", + "\n", + "# check work\n", + "# check that all categories of plagiarism have a class label = 1\n", + "transformed_df.head(10)" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Tests Passed!\n", + "\n", + "Example data: \n" + ] + }, + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
FileTaskCategoryClass
0g0pA_taska.txta00
1g0pA_taskb.txtb31
2g0pA_taskc.txtc21
3g0pA_taskd.txtd11
4g0pA_taske.txte00
\n", + "
" + ], + "text/plain": [ + " File Task Category Class\n", + "0 g0pA_taska.txt a 0 0\n", + "1 g0pA_taskb.txt b 3 1\n", + "2 g0pA_taskc.txt c 2 1\n", + "3 g0pA_taskd.txt d 1 1\n", + "4 g0pA_taske.txt e 0 0" + ] + }, + "execution_count": 6, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# test cell that creates `transformed_df`, if tests are passed\n", + "\n", + "\"\"\"\n", + "DON'T MODIFY ANYTHING IN THIS CELL THAT IS BELOW THIS LINE\n", + "\"\"\"\n", + "\n", + "# importing tests\n", + "import problem_unittests as tests\n", + "\n", + "# test numerical_dataframe function\n", + "tests.test_numerical_df(numerical_dataframe)\n", + "\n", + "# if above test is passed, create NEW `transformed_df`\n", + "transformed_df = numerical_dataframe(csv_file ='data/file_information.csv')\n", + "\n", + "# check work\n", + "print('\\nExample data: ')\n", + "transformed_df.head()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Text Processing & Splitting Data\n", + "\n", + "Recall that the goal of this project is to build a plagiarism classifier. At it's heart, this task is a comparison text; one that looks at a given answer and a source text, compares them and predicts whether an answer has plagiarized from the source. To effectively do this comparison, and train a classifier we'll need to do a few more things: pre-process all of our text data and prepare the text files (in this case, the 95 answer files and 5 original source files) to be easily compared, and split our data into a `train` and `test` set that can be used to train a classifier and evaluate it, respectively. \n", + "\n", + "To this end, you've been provided code that adds additional information to your `transformed_df` from above. The next two cells need not be changed; they add two additional columns to the `transformed_df`:\n", + "\n", + "1. A `Text` column; this holds all the lowercase text for a `File`, with extraneous punctuation removed.\n", + "2. A `Datatype` column; this is a string value `train`, `test`, or `orig` that labels a data point as part of our train or test set\n", + "\n", + "The details of how these additional columns are created can be found in the `helpers.py` file in the project directory. You're encouraged to read through that file to see exactly how text is processed and how data is split.\n", + "\n", + "Run the cells below to get a `complete_df` that has all the information you need to proceed with plagiarism detection and feature engineering." + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
FileTaskCategoryClassText
0g0pA_taska.txta00inheritance is a basic concept of object orien...
1g0pA_taskb.txtb31pagerank is a link analysis algorithm used by ...
2g0pA_taskc.txtc21the vector space model also called term vector...
3g0pA_taskd.txtd11bayes theorem was names after rev thomas bayes...
4g0pA_taske.txte00dynamic programming is an algorithm design tec...
\n", + "
" + ], + "text/plain": [ + " File Task Category Class \\\n", + "0 g0pA_taska.txt a 0 0 \n", + "1 g0pA_taskb.txt b 3 1 \n", + "2 g0pA_taskc.txt c 2 1 \n", + "3 g0pA_taskd.txt d 1 1 \n", + "4 g0pA_taske.txt e 0 0 \n", + "\n", + " Text \n", + "0 inheritance is a basic concept of object orien... \n", + "1 pagerank is a link analysis algorithm used by ... \n", + "2 the vector space model also called term vector... \n", + "3 bayes theorem was names after rev thomas bayes... \n", + "4 dynamic programming is an algorithm design tec... " + ] + }, + "execution_count": 7, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "\"\"\"\n", + "DON'T MODIFY ANYTHING IN THIS CELL THAT IS BELOW THIS LINE\n", + "\"\"\"\n", + "import helpers \n", + "\n", + "# create a text column \n", + "text_df = helpers.create_text_column(transformed_df)\n", + "text_df.head()" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Sample processed text:\n", + "\n", + " inheritance is a basic concept of object oriented programming where the basic idea is to create new classes that add extra detail to existing classes this is done by allowing the new classes to reuse the methods and variables of the existing classes and new methods and classes are added to specialise the new class inheritance models the is kind of relationship between entities or objects for example postgraduates and undergraduates are both kinds of student this kind of relationship can be visualised as a tree structure where student would be the more general root node and both postgraduate and undergraduate would be more specialised extensions of the student node or the child nodes in this relationship student would be known as the superclass or parent class whereas postgraduate would be known as the subclass or child class because the postgraduate class extends the student class inheritance can occur on several layers where if visualised would display a larger tree structure for example we could further extend the postgraduate node by adding two extra extended classes to it called msc student and phd student as both these types of student are kinds of postgraduate student this would mean that both the msc student and phd student classes would inherit methods and variables from both the postgraduate and student classes \n" + ] + } + ], + "source": [ + "# after running the cell above\n", + "# check out the processed text for a single file, by row index\n", + "row_idx = 0 # feel free to change this index\n", + "\n", + "sample_text = text_df.iloc[0]['Text']\n", + "\n", + "print('Sample processed text:\\n\\n', sample_text)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Split data into training and test sets\n", + "\n", + "The next cell will add a `Datatype` column to a given DataFrame to indicate if the record is: \n", + "* `train` - Training data, for model training.\n", + "* `test` - Testing data, for model evaluation.\n", + "* `orig` - The task's original answer from wikipedia.\n", + "\n", + "### Stratified sampling\n", + "\n", + "The given code uses a helper function which you can view in the `helpers.py` file in the main project directory. This implements [stratified random sampling](https://en.wikipedia.org/wiki/Stratified_sampling) to randomly split data by task & plagiarism amount. 
Stratified sampling ensures that we get training and test data that is fairly evenly distributed across task & plagiarism combinations. Approximately 26% of the data is held out for testing and 74% is used for training.\n",
+    "\n",
+    "The function **train_test_dataframe** takes in a DataFrame that it assumes has `Task` and `Category` columns, and returns a modified frame that indicates which `Datatype` (train, test, or orig) a file falls into. This sampling will change slightly based on a passed-in *random_seed*. Given the small sample size, stratified random sampling provides more stable results for a binary plagiarism classifier. Stability here means a smaller *variance* in the accuracy of the classifier across random seeds."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 9,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/html": [
+       "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
FileTaskCategoryClassTextDatatype
0g0pA_taska.txta00inheritance is a basic concept of object orien...train
1g0pA_taskb.txtb31pagerank is a link analysis algorithm used by ...test
2g0pA_taskc.txtc21the vector space model also called term vector...train
3g0pA_taskd.txtd11bayes theorem was names after rev thomas bayes...train
4g0pA_taske.txte00dynamic programming is an algorithm design tec...train
5g0pB_taska.txta00inheritance is a basic concept in object orien...train
6g0pB_taskb.txtb00pagerank pr refers to both the concept and the...train
7g0pB_taskc.txtc31vector space model is an algebraic model for r...test
8g0pB_taskd.txtd21bayes theorem relates the conditional and marg...train
9g0pB_taske.txte11dynamic programming is a method for solving ma...test
\n", + "
" + ], + "text/plain": [ + " File Task Category Class \\\n", + "0 g0pA_taska.txt a 0 0 \n", + "1 g0pA_taskb.txt b 3 1 \n", + "2 g0pA_taskc.txt c 2 1 \n", + "3 g0pA_taskd.txt d 1 1 \n", + "4 g0pA_taske.txt e 0 0 \n", + "5 g0pB_taska.txt a 0 0 \n", + "6 g0pB_taskb.txt b 0 0 \n", + "7 g0pB_taskc.txt c 3 1 \n", + "8 g0pB_taskd.txt d 2 1 \n", + "9 g0pB_taske.txt e 1 1 \n", + "\n", + " Text Datatype \n", + "0 inheritance is a basic concept of object orien... train \n", + "1 pagerank is a link analysis algorithm used by ... test \n", + "2 the vector space model also called term vector... train \n", + "3 bayes theorem was names after rev thomas bayes... train \n", + "4 dynamic programming is an algorithm design tec... train \n", + "5 inheritance is a basic concept in object orien... train \n", + "6 pagerank pr refers to both the concept and the... train \n", + "7 vector space model is an algebraic model for r... test \n", + "8 bayes theorem relates the conditional and marg... train \n", + "9 dynamic programming is a method for solving ma... test " + ] + }, + "execution_count": 9, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "random_seed = 1 # can change; set for reproducibility\n", + "\n", + "\"\"\"\n", + "DON'T MODIFY ANYTHING IN THIS CELL THAT IS BELOW THIS LINE\n", + "\"\"\"\n", + "import helpers\n", + "\n", + "# create new df with Datatype (train, test, orig) column\n", + "# pass in `text_df` from above to create a complete dataframe, with all the information you need\n", + "complete_df = helpers.train_test_dataframe(text_df, random_seed=random_seed)\n", + "\n", + "# check results\n", + "complete_df.head(10)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Determining Plagiarism\n", + "\n", + "Now that you've prepared this data and created a `complete_df` of information, including the text and class associated with each file, you can move on to the task of extracting similarity features that will be useful for plagiarism classification. \n", + "\n", + "> Note: The following code exercises, assume that the `complete_df` as it exists now, will **not** have its existing columns modified. \n", + "\n", + "The `complete_df` should always include the columns: `['File', 'Task', 'Category', 'Class', 'Text', 'Datatype']`. You can add additional columns, and you can create any new DataFrames you need by copying the parts of the `complete_df` as long as you do not modify the existing values, directly.\n", + "\n", + "---" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "\n", + "# Similarity Features \n", + "\n", + "One of the ways we might go about detecting plagiarism, is by computing **similarity features** that measure how similar a given answer text is as compared to the original wikipedia source text (for a specific task, a-e). The similarity features you will use are informed by [this paper on plagiarism detection](https://s3.amazonaws.com/video.udacity-data.com/topher/2019/January/5c412841_developing-a-corpus-of-plagiarised-short-answers/developing-a-corpus-of-plagiarised-short-answers.pdf). \n", + "> In this paper, researchers created features called **containment** and **longest common subsequence**. \n", + "\n", + "Using these features as input, you will train a model to distinguish between plagiarized and not-plagiarized text files.\n", + "\n", + "## Feature Engineering\n", + "\n", + "Let's talk a bit more about the features we want to include in a plagiarism detection model and how to calculate such features. 
In the following explanations, I'll refer to a submitted text file as a **Student Answer Text (A)** and the original, Wikipedia source file (that we want to compare that answer to) as the **Wikipedia Source Text (S)**.\n", + "\n", + "### Containment\n", + "\n", + "Your first task will be to create **containment features**. To understand containment, let's first revisit a definition of [n-grams](https://en.wikipedia.org/wiki/N-gram). An *n-gram* is a sequential word grouping. For example, in a line like \"bayes rule gives us a way to combine prior knowledge with new information,\" a 1-gram is just one word, like \"bayes.\" A 2-gram might be \"bayes rule\" and a 3-gram might be \"combine prior knowledge.\"\n", + "\n", + "> Containment is defined as the **intersection** of the n-gram word count of the Wikipedia Source Text (S) with the n-gram word count of the Student Answer Text (A) *divided* by the n-gram word count of the Student Answer Text.\n", + "\n", + "$$ \\frac{\\sum{count(\\text{ngram}_{A}) \\cap count(\\text{ngram}_{S})}}{\\sum{count(\\text{ngram}_{A})}} $$\n", + "\n", + "If the two texts have no n-grams in common, the containment will be 0, but if _all_ their n-grams intersect then the containment will be 1. Intuitively, you can see how having longer n-grams in common might be an indication of cut-and-paste plagiarism. In this project, it will be up to you to decide on the appropriate `n` or several `n`'s to use in your final model.\n", + "\n", + "### EXERCISE: Create containment features\n", + "\n", + "Given the `complete_df` that you've created, you should have all the information you need to compare any Student Answer Text (A) with its appropriate Wikipedia Source Text (S). An answer for task A should be compared to the source text for task A, just as answers to tasks B, C, D, and E should be compared to the corresponding original source text.\n", + "\n", + "In this exercise, you'll complete the function `calculate_containment`, which calculates containment based upon the following parameters:\n", + "* A given DataFrame, `df` (which is assumed to be the `complete_df` from above)\n", + "* An `answer_filename`, such as 'g0pB_taskd.txt' \n", + "* An n-gram length, `n`\n", + "\n", + "### Containment calculation\n", + "\n", + "The general steps to complete this function are as follows:\n", + "1. From *all* of the text files in a given `df`, create an array of n-gram counts; it is suggested that you use a [CountVectorizer](https://scikit-learn.org/stable/modules/generated/sklearn.feature_extraction.text.CountVectorizer.html) for this purpose.\n", + "2. Get the processed answer and source texts for the given `answer_filename`.\n", + "3. Calculate the containment between an answer and source text according to the following equation.\n", + "\n", + " >$$ \\frac{\\sum{count(\\text{ngram}_{A}) \\cap count(\\text{ngram}_{S})}}{\\sum{count(\\text{ngram}_{A})}} $$\n", + " \n", + "4. Return that containment value.\n", + "\n", + "You are encouraged to write any helper functions that you need to complete the function below.\n",
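+ "\n",
+ "To build intuition before you write the function, here is a small sketch (using made-up strings `a_text` and `s_text`, not files from the dataset) of the calculation above: the intersection of n-gram counts is just the element-wise minimum of the two count vectors, summed, and then divided by the answer's total n-gram count.\n",
+ "\n",
+ "```python\n",
+ "from sklearn.feature_extraction.text import CountVectorizer\n",
+ "import numpy as np\n",
+ "\n",
+ "# hypothetical toy strings, for illustration only\n",
+ "a_text = 'this is an answer text'   # Student Answer Text (A)\n",
+ "s_text = 'this is the source text'  # Wikipedia Source Text (S)\n",
+ "\n",
+ "# count 1-grams over a shared vocabulary built from both texts\n",
+ "counts = CountVectorizer(analyzer='word', ngram_range=(1, 1)).fit_transform([a_text, s_text]).toarray()\n",
+ "\n",
+ "# numerator: the intersection of counts is the element-wise minimum, summed\n",
+ "intersection = np.minimum(counts[0], counts[1]).sum()\n",
+ "\n",
+ "# denominator: total n-gram count in the answer text\n",
+ "print(intersection / counts[0].sum())  # 3 shared words / 5 answer words = 0.6\n",
+ "```"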
+ ] + }, + { + "cell_type": "code", + "execution_count": 10, + "metadata": {}, + "outputs": [], + "source": [ + "from sklearn.feature_extraction.text import CountVectorizer\n", + "\n", + "# Calculate the ngram containment for one answer file/source file pair in a df\n", + "def calculate_containment(df, n, answer_filename):\n", + " '''Calculates the containment between a given answer text and its associated source text.\n", + " This function creates a count of ngrams (of a size, n) for each text file in our data.\n", + " Then calculates the containment by finding the ngram count for a given answer text, \n", + " and its associated source text, and calculating the normalized intersection of those counts.\n", + " :param df: A dataframe with columns,\n", + " 'File', 'Task', 'Category', 'Class', 'Text', and 'Datatype'\n", + " :param n: An integer that defines the ngram size\n", + " :param answer_filename: A filename for an answer text in the df, ex. 'g0pB_taskd.txt'\n", + " :return: A single containment value that represents the similarity\n", + " between an answer text and its source text.\n", + " '''\n", + " \n", + " # your code here\n", + " row = df.loc[df['File'] == answer_filename]\n", + " answer = row['File']\n", + " answer_location = answer.index.item()\n", + " \n", + " source = df[(df['Task'] == df.iloc[answer_location]['Task']) & (df['Category'] == -1)]\n", + " source_location = source.index.item()\n", + " \n", + " counts = CountVectorizer(analyzer='word', ngram_range=(n,n))\n", + " ngrams = counts.fit_transform(df['Text'])\n", + " \n", + " ngram_array = ngrams.toarray()\n", + " answer_and_source = ngram_array[(answer_location, source_location),]\n", + " \n", + " sum_intersection_ngrams = np.sum(np.min(answer_and_source, axis=0))\n", + " containment = sum_intersection_ngrams / np.sum(answer_and_source[0])\n", + " \n", + " return containment\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Test cells\n", + "\n", + "After you've implemented the containment function, you can test out its behavior. \n", + "\n", + "The cell below iterates through the first few files, and calculates the original category _and_ containment values for a specified n and file.\n", + "\n", + ">If you've implemented this correctly, you should see that the non-plagiarized have low or close to 0 containment values and that plagiarized examples have higher containment values, closer to 1.\n", + "\n", + "Note what happens when you change the value of n. I recommend applying your code to multiple files and comparing the resultant containment values. You should see that the highest containment values correspond to files with the highest category (`cut`) of plagiarism level." 
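+ "\n",
+ "For example, one quick way to see the effect of `n` (a sketch you can adapt; the filename is just one of the answer files shown earlier) is to loop over several n-gram sizes for a single file:\n",
+ "\n",
+ "```python\n",
+ "# sketch: containment for one answer file at several n-gram sizes\n",
+ "for n_size in [1, 2, 3, 4]:\n",
+ "    c = calculate_containment(complete_df, n_size, 'g0pA_taskb.txt')\n",
+ "    print(str(n_size) + '-gram containment:', c)\n",
+ "```"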
+ ] + }, + { + "cell_type": "code", + "execution_count": 11, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Original category values: \n", + " [0, 3, 2, 1, 0]\n", + "\n", + "3-gram containment values: \n", + " [0.009345794392523364, 0.9641025641025641, 0.6136363636363636, 0.15675675675675677, 0.031746031746031744]\n" + ] + } + ], + "source": [ + "# select a value for n\n", + "n = 3\n", + "\n", + "# indices for first few files\n", + "test_indices = range(5)\n", + "\n", + "# iterate through files and calculate containment\n", + "category_vals = []\n", + "containment_vals = []\n", + "for i in test_indices:\n", + "    # get level of plagiarism for a given file index\n", + "    category_vals.append(complete_df.loc[i, 'Category'])\n", + "    # calculate containment for given file and n\n", + "    filename = complete_df.loc[i, 'File']\n", + "    c = calculate_containment(complete_df, n, filename)\n", + "    containment_vals.append(c)\n", + "\n", + "# print out result, does it make sense?\n", + "print('Original category values: \\n', category_vals)\n", + "print()\n", + "print(str(n)+'-gram containment values: \\n', containment_vals)" + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Tests Passed!\n" + ] + } + ], + "source": [ + "# run this test cell\n", + "\"\"\"\n", + "DON'T MODIFY ANYTHING IN THIS CELL THAT IS BELOW THIS LINE\n", + "\"\"\"\n", + "# test containment calculation\n", + "# params: complete_df from before, and containment function\n", + "tests.test_containment(complete_df, calculate_containment)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### QUESTION 1: Why can we calculate containment features across *all* data (training & test), prior to splitting the DataFrame for modeling? That is, what about the containment calculation means that the test and training data do not influence each other?" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "**Answer:** The containment value for a given answer file depends only on that answer's text and its single associated source text. Although the `CountVectorizer` is fit on *all* of the texts, the vocabulary contributed by other files only adds zero counts for the pair being compared, so it does not change the result. Since no statistics are shared between answer files, calculating these features across all of the data does not allow the training and test sets to influence each other.\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "---\n", + "## Longest Common Subsequence\n", + "\n", + "Containment is a good way to find overlap in word usage between two documents; it may help identify cases of cut-and-paste as well as paraphrased levels of plagiarism. Since plagiarism is a fairly complex task with varying levels, it's often useful to include other measures of similarity. The paper also discusses a feature called **longest common subsequence**.\n", + "\n", + "> The longest common subsequence is the longest string of words (or letters) that are *the same* between the Wikipedia Source Text (S) and the Student Answer Text (A). This value is also normalized by dividing by the total number of words (or letters) in the Student Answer Text. \n", + "\n", + "In this exercise, we'll ask you to calculate the longest common subsequence of words between two texts.\n", + "\n", + "### EXERCISE: Calculate the longest common subsequence\n", + "\n", + "Complete the function `lcs_norm_word`; this should calculate the *longest common subsequence* of words between a Student Answer Text and corresponding Wikipedia Source Text. \n", + "\n", + "It may be helpful to think of this in a concrete example. 
A Longest Common Subsequence (LCS) problem may look as follows:\n", + "* Given two texts: text A (answer text) of length n, and string S (original source text) of length m. Our goal is to produce their longest common subsequence of words: the longest sequence of words that appear left-to-right in both texts (though the words don't have to be in continuous order).\n", + "* Consider:\n", + "    * A = \"i think pagerank is a link analysis algorithm used by google that uses a system of weights attached to each element of a hyperlinked set of documents\"\n", + "    * S = \"pagerank is a link analysis algorithm used by the google internet search engine that assigns a numerical weighting to each element of a hyperlinked set of documents\"\n", + "\n", + "* In this case, we can see that the start of each sentence is fairly similar, having overlap in the sequence of words, \"pagerank is a link analysis algorithm used by\" before diverging slightly. Then we **continue moving left-to-right along both texts** until we see the next common sequence; in this case it is only one word, \"google\". Next we find \"that\" and \"a\" and finally the same ending \"to each element of a hyperlinked set of documents\".\n", + "* Below is a clear visual of how these sequences were found, sequentially, in each text.\n", + "\n", + "\n", + "\n", + "* Now, those words appear in left-to-right order in each document, sequentially, and even though there are some words in between, we count this as the longest common subsequence between the two texts. \n", + "* If I count up each word that I found in common I get the value 20. **So, LCS has length 20**. \n", + "* Next, to normalize this value, divide by the total length of the student answer; in this example that length is only 27. **So, the function `lcs_norm_word` should return the value `20/27` or about `0.7408`.**\n", + "\n", + "In this way, LCS is a great indicator of cut-and-paste plagiarism or if someone has referenced the same source text multiple times in an answer." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### LCS, dynamic programming\n", + "\n", + "If you read through the scenario above, you can see that this algorithm depends on looking at two texts and comparing them word by word. You can solve this problem in multiple ways. First, it may be useful to `.split()` each text into lists of words to compare. Then, you can iterate through each word in the texts and compare them, adding to your value for LCS as you go. \n", + "\n", + "The method I recommend for implementing an efficient LCS algorithm is to use a matrix and dynamic programming. **Dynamic programming** is all about breaking a larger problem into a smaller set of subproblems, and building up a complete result without having to repeat any subproblems. \n", + "\n", + "This approach assumes that you can split up a large LCS task into a combination of smaller LCS tasks. Let's look at a simple example that compares letters:\n", + "\n", + "* A = \"ABCD\"\n", + "* S = \"BD\"\n", + "\n", + "We can see right away that the longest subsequence of _letters_ here is 2 (B and D are in sequence in both strings). 
And we can calculate this by looking at relationships between each letter in the two strings, A and S.\n", + "\n", + "Here, I have a matrix with the letters of A on top and the letters of S on the left side:\n", + "\n", + "\n", + "\n", + "This starts out as a matrix that has as many columns and rows as letters in the strings A and S, **+1** additional row and column, filled with zeros on the top and left sides. So, in this case, instead of a 2x4 matrix it is a 3x5.\n", + "\n", + "Now, we can fill this matrix up by breaking it into smaller LCS problems. For example, let's first look at the shortest substrings: the starting letter of A and S. We'll first ask, what is the Longest Common Subsequence between these two letters \"A\" and \"B\"? \n", + "\n", + "**Here, the answer is zero and we fill in the corresponding grid cell with that value.**\n", + "\n", + "\n", + "\n", + "Then, we ask the next question, what is the LCS between \"AB\" and \"B\"?\n", + "\n", + "**Here, we have a match, and can fill in the appropriate value 1**.\n", + "\n", + "\n", + "\n", + "If we continue, we get to a final matrix that looks as follows, with a **2** in the bottom right corner.\n", + "\n", + "\n", + "\n", + "The final LCS will be that value **2** *normalized* by the number of letters in A. So, our normalized value is 2/4 = **0.5**.\n", + "\n", + "### The matrix rules\n", + "\n", + "One thing to notice here is that you can efficiently fill up this matrix one cell at a time. Each grid cell only depends on the values in the grid cells that are directly on top and to the left of it, or on the diagonal/top-left. The rules are as follows:\n", + "* Start with a matrix that has one extra row and column of zeros.\n", + "* As you traverse your string:\n", + "    * If there is a match, fill that grid cell with the value to the top-left of that cell *plus* one. So, in our case, when we found a matching B-B, we added +1 to the value in the top-left of the matching cell, 0.\n", + "    * If there is not a match, take the *maximum* value from either directly to the left or the top cell, and carry that value over to the non-match cell.\n", + "\n", + "\n", + "\n", + "After completely filling the matrix, **the bottom-right cell will hold the non-normalized LCS value**.\n", + "\n", + "This matrix treatment can be applied to a set of words instead of letters. Your function should apply this to the words in two texts and return the normalized LCS value.\n",
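+ "\n",
+ "If it helps, here is a small sketch of the letter example above, filled in with exactly these matrix rules (this is only the toy illustration; your `lcs_norm_word` function below should apply the same idea to words):\n",
+ "\n",
+ "```python\n",
+ "import numpy as np\n",
+ "\n",
+ "A = 'ABCD'   # answer letters (columns)\n",
+ "S = 'BD'     # source letters (rows)\n",
+ "\n",
+ "# matrix with one extra row and column of zeros\n",
+ "lcs = np.zeros((len(S) + 1, len(A) + 1), dtype=int)\n",
+ "\n",
+ "for i, s_char in enumerate(S, 1):\n",
+ "    for j, a_char in enumerate(A, 1):\n",
+ "        if s_char == a_char:\n",
+ "            lcs[i][j] = lcs[i - 1][j - 1] + 1             # match: top-left value + 1\n",
+ "        else:\n",
+ "            lcs[i][j] = max(lcs[i - 1][j], lcs[i][j - 1]) # no match: max of top / left\n",
+ "\n",
+ "print(lcs[-1][-1] / len(A))  # 2 / 4 = 0.5\n",
+ "```"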
+ ] + }, + { + "cell_type": "code", + "execution_count": 13, + "metadata": {}, + "outputs": [], + "source": [ + "# Compute the normalized LCS given an answer text and a source text\n", + "def lcs_norm_word(answer_text, source_text):\n", + " '''Computes the longest common subsequence of words in two texts; returns a normalized value.\n", + " :param answer_text: The pre-processed text for an answer text\n", + " :param source_text: The pre-processed text for an answer's associated source text\n", + " :return: A normalized LCS value'''\n", + " \n", + " answer = answer_text.split()\n", + " source = source_text.split()\n", + " \n", + " lcs_matrix = np.zeros((len(answer) + 1, len(source) + 1))\n", + " row_index= 0\n", + " col_index = 0\n", + " for row_index in range(0, len(answer)):\n", + " answer_word = answer[row_index]\n", + " for col_index in range(0, len(source)):\n", + " source_word = source[col_index]\n", + " if source_word == answer_word:\n", + " lcs_matrix[row_index + 1][col_index + 1] = (lcs_matrix[row_index][col_index]) + 1\n", + " else: \n", + " lcs_matrix[row_index + 1][col_index + 1] = max(lcs_matrix[row_index][col_index + 1], \n", + " lcs_matrix[row_index + 1][col_index])\n", + "\n", + " normalized_lcs = lcs_matrix[len(answer)][len(source)] / len(answer)\n", + " print(normalized_lcs)\n", + " return normalized_lcs\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Test cells\n", + "\n", + "Let's start by testing out your code on the example given in the initial description.\n", + "\n", + "In the below cell, we have specified strings A (answer text) and S (original source text). We know that these texts have 20 words in common and the submitted answer is 27 words long, so the normalized, longest common subsequence should be 20/27.\n" + ] + }, + { + "cell_type": "code", + "execution_count": 14, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "0.7407407407407407\n", + "LCS = 0.7407407407407407\n", + "Test passed!\n" + ] + } + ], + "source": [ + "# Run the test scenario from above\n", + "# does your function return the expected value?\n", + "\n", + "A = \"i think pagerank is a link analysis algorithm used by google that uses a system of weights attached to each element of a hyperlinked set of documents\"\n", + "S = \"pagerank is a link analysis algorithm used by the google internet search engine that assigns a numerical weighting to each element of a hyperlinked set of documents\"\n", + "\n", + "# calculate LCS\n", + "lcs = lcs_norm_word(A, S)\n", + "print('LCS = ', lcs)\n", + "\n", + "\n", + "# expected value test\n", + "assert lcs==20/27., \"Incorrect LCS value, expected about 0.7408, got \"+str(lcs)\n", + "\n", + "print('Test passed!')" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "This next cell runs a more rigorous test." + ] + }, + { + "cell_type": "code", + "execution_count": 15, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
FileTaskCategoryClassTextDatatype
0g0pA_taska.txta00inheritance is a basic concept of object orien...train
1g0pA_taskb.txtb31pagerank is a link analysis algorithm used by ...test
2g0pA_taskc.txtc21the vector space model also called term vector...train
3g0pA_taskd.txtd11bayes theorem was names after rev thomas bayes...train
4g0pA_taske.txte00dynamic programming is an algorithm design tec...train
\n", + "
" + ], + "text/plain": [ + " File Task Category Class \\\n", + "0 g0pA_taska.txt a 0 0 \n", + "1 g0pA_taskb.txt b 3 1 \n", + "2 g0pA_taskc.txt c 2 1 \n", + "3 g0pA_taskd.txt d 1 1 \n", + "4 g0pA_taske.txt e 0 0 \n", + "\n", + " Text Datatype \n", + "0 inheritance is a basic concept of object orien... train \n", + "1 pagerank is a link analysis algorithm used by ... test \n", + "2 the vector space model also called term vector... train \n", + "3 bayes theorem was names after rev thomas bayes... train \n", + "4 dynamic programming is an algorithm design tec... train " + ] + }, + "execution_count": 15, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "complete_df.head()" + ] + }, + { + "cell_type": "code", + "execution_count": 16, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "0.42783505154639173\n", + "0.1917808219178082\n", + "0.8207547169811321\n", + "0.8464912280701754\n", + "0.3160621761658031\n", + "0.24257425742574257\n", + "Tests Passed!\n" + ] + } + ], + "source": [ + "# run test cell\n", + "\"\"\"\n", + "DON'T MODIFY ANYTHING IN THIS CELL THAT IS BELOW THIS LINE\n", + "\"\"\"\n", + "# test lcs implementation\n", + "# params: complete_df from before, and lcs_norm_word function\n", + "tests.test_lcs(complete_df, lcs_norm_word)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Finally, take a look at a few resultant values for `lcs_norm_word`. Just like before, you should see that higher values correspond to higher levels of plagiarism." + ] + }, + { + "cell_type": "code", + "execution_count": 17, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "0.1917808219178082\n", + "0.8207547169811321\n", + "0.8464912280701754\n", + "0.3160621761658031\n", + "0.24257425742574257\n", + "Original category values: \n", + " [0, 3, 2, 1, 0]\n", + "\n", + "Normalized LCS values: \n", + " [0.1917808219178082, 0.8207547169811321, 0.8464912280701754, 0.3160621761658031, 0.24257425742574257]\n" + ] + } + ], + "source": [ + "# test on your own\n", + "test_indices = range(5) # look at first few files\n", + "\n", + "category_vals = []\n", + "lcs_norm_vals = []\n", + "# iterate through first few docs and calculate LCS\n", + "for i in test_indices:\n", + " category_vals.append(complete_df.loc[i, 'Category'])\n", + " # get texts to compare\n", + " answer_text = complete_df.loc[i, 'Text'] \n", + " task = complete_df.loc[i, 'Task']\n", + " # we know that source texts have Class = -1\n", + " orig_rows = complete_df[(complete_df['Class'] == -1)]\n", + " orig_row = orig_rows[(orig_rows['Task'] == task)]\n", + " source_text = orig_row['Text'].values[0]\n", + " # calculate lcs\n", + " lcs_val = lcs_norm_word(answer_text, source_text)\n", + " lcs_norm_vals.append(lcs_val)\n", + "\n", + "# print out result, does it make sense?\n", + "print('Original category values: \\n', category_vals)\n", + "print()\n", + "print('Normalized LCS values: \\n', lcs_norm_vals)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "---\n", + "# Create All Features\n", + "\n", + "Now that you've completed the feature calculation functions, it's time to actually create multiple features and decide on which ones to use in your final model! 
In the below cells, you're provided two helper functions to help you create multiple features and store those in a DataFrame, `features_df`.\n", + "\n", + "### Creating multiple containment features\n", + "\n", + "Your completed `calculate_containment` function will be called in the next cell, which defines the helper function `create_containment_features`. \n", + "\n", + "> This function returns a list of containment features, calculated for a given `n` and for *all* files in a df (assumed to the the `complete_df`).\n", + "\n", + "For our original files, the containment value is set to a special value, -1.\n", + "\n", + "This function gives you the ability to easily create several containment features, of different n-gram lengths, for each of our text files." + ] + }, + { + "cell_type": "code", + "execution_count": 18, + "metadata": {}, + "outputs": [], + "source": [ + "\"\"\"\n", + "DON'T MODIFY ANYTHING IN THIS CELL THAT IS BELOW THIS LINE\n", + "\"\"\"\n", + "# Function returns a list of containment features, calculated for a given n \n", + "# Should return a list of length 100 for all files in a complete_df\n", + "def create_containment_features(df, n, column_name=None):\n", + " \n", + " containment_values = []\n", + " \n", + " if(column_name==None):\n", + " column_name = 'c_'+str(n) # c_1, c_2, .. c_n\n", + " \n", + " # iterates through dataframe rows\n", + " for i in df.index:\n", + " file = df.loc[i, 'File']\n", + " # Computes features using calculate_containment function\n", + " if df.loc[i,'Category'] > -1:\n", + " c = calculate_containment(df, n, file)\n", + " containment_values.append(c)\n", + " # Sets value to -1 for original tasks \n", + " else:\n", + " containment_values.append(-1)\n", + " \n", + " print(str(n)+'-gram containment features created!')\n", + " return containment_values\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Creating LCS features\n", + "\n", + "Below, your complete `lcs_norm_word` function is used to create a list of LCS features for all the answer files in a given DataFrame (again, this assumes you are passing in the `complete_df`. 
It assigns a special value for our original, source files, -1.\n" + ] + }, + { + "cell_type": "code", + "execution_count": 19, + "metadata": {}, + "outputs": [], + "source": [ + "\"\"\"\n", + "DON'T MODIFY ANYTHING IN THIS CELL THAT IS BELOW THIS LINE\n", + "\"\"\"\n", + "# Function creates lcs feature and add it to the dataframe\n", + "def create_lcs_features(df, column_name='lcs_word'):\n", + " \n", + " lcs_values = []\n", + " \n", + " # iterate through files in dataframe\n", + " for i in df.index:\n", + " # Computes LCS_norm words feature using function above for answer tasks\n", + " if df.loc[i,'Category'] > -1:\n", + " # get texts to compare\n", + " answer_text = df.loc[i, 'Text'] \n", + " task = df.loc[i, 'Task']\n", + " # we know that source texts have Class = -1\n", + " orig_rows = df[(df['Class'] == -1)]\n", + " orig_row = orig_rows[(orig_rows['Task'] == task)]\n", + " source_text = orig_row['Text'].values[0]\n", + "\n", + " # calculate lcs\n", + " lcs = lcs_norm_word(answer_text, source_text)\n", + " lcs_values.append(lcs)\n", + " # Sets to -1 for original tasks \n", + " else:\n", + " lcs_values.append(-1)\n", + "\n", + " print('LCS features created!')\n", + " return lcs_values\n", + " " + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## EXERCISE: Create a features DataFrame by selecting an `ngram_range`\n", + "\n", + "The paper suggests calculating the following features: containment *1-gram to 5-gram* and *longest common subsequence*. \n", + "> In this exercise, you can choose to create even more features, for example from *1-gram to 7-gram* containment features and *longest common subsequence*. \n", + "\n", + "You'll want to create at least 6 features to choose from as you think about which to give to your final, classification model. Defining and comparing at least 6 different features allows you to discard any features that seem redundant, and choose to use the best features for your final model!\n", + "\n", + "In the below cell **define an n-gram range**; these will be the n's you use to create n-gram containment features. The rest of the feature creation code is provided." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 20, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "1-gram containment features created!\n", + "2-gram containment features created!\n", + "3-gram containment features created!\n", + "4-gram containment features created!\n", + "5-gram containment features created!\n", + "6-gram containment features created!\n", + "0.1917808219178082\n", + "0.8207547169811321\n", + "0.8464912280701754\n", + "0.3160621761658031\n", + "0.24257425742574257\n", + "0.16117216117216118\n", + "0.30165289256198347\n", + "0.6217105263157895\n", + "0.484304932735426\n", + "0.597457627118644\n", + "0.42783505154639173\n", + "0.2708333333333333\n", + "0.22395833333333334\n", + "0.9\n", + "0.8940397350993378\n", + "0.8232044198895028\n", + "0.775\n", + "0.45977011494252873\n", + "0.3055555555555556\n", + "0.2826086956521739\n", + "0.9930555555555556\n", + "0.7888888888888889\n", + "0.3246753246753247\n", + "0.3466666666666667\n", + "1.0\n", + "0.18932038834951456\n", + "0.36893203883495146\n", + "0.4166666666666667\n", + "0.4898785425101215\n", + "0.24742268041237114\n", + "0.21875\n", + "0.29441624365482233\n", + "0.5163934426229508\n", + "0.4725274725274725\n", + "0.6064516129032258\n", + "0.536697247706422\n", + "0.39436619718309857\n", + "0.25833333333333336\n", + "0.2789115646258503\n", + "0.3431372549019608\n", + "0.15302491103202848\n", + "0.4559386973180077\n", + "0.82\n", + "0.45\n", + "0.22935779816513763\n", + "0.16535433070866143\n", + "0.26046511627906976\n", + "0.3415841584158416\n", + "0.9294117647058824\n", + "1.0\n", + "0.6699029126213593\n", + "0.3551912568306011\n", + "0.23376623376623376\n", + "0.3492647058823529\n", + "0.3476190476190476\n", + "0.5677233429394812\n", + "0.774390243902439\n", + "0.19298245614035087\n", + "0.21818181818181817\n", + "0.26666666666666666\n", + "0.22110552763819097\n", + "0.5047169811320755\n", + "0.5585585585585585\n", + "0.9966996699669967\n", + "0.2289156626506024\n", + "0.1722488038277512\n", + "0.23684210526315788\n", + "0.29493087557603687\n", + "0.5037593984962406\n", + "0.9117647058823529\n", + "0.9923076923076923\n", + "0.2833333333333333\n", + "0.2616822429906542\n", + "0.6470588235294118\n", + "0.85\n", + "0.178743961352657\n", + "0.2350230414746544\n", + "0.6619718309859155\n", + "0.7911111111111111\n", + "0.9298245614035088\n", + "0.8546712802768166\n", + "0.2983425414364641\n", + "0.2230769230769231\n", + "0.9270833333333334\n", + "0.9098039215686274\n", + "0.4900990099009901\n", + "0.25203252032520324\n", + "0.1774193548387097\n", + "0.22767857142857142\n", + "0.6437246963562753\n", + "0.24271844660194175\n", + "0.8395061728395061\n", + "0.2830188679245283\n", + "0.16176470588235295\n", + "0.24583333333333332\n", + "LCS features created!\n", + "\n", + "Features: ['c_1', 'c_2', 'c_3', 'c_4', 'c_5', 'c_6', 'lcs_word']\n", + "\n" + ] + } + ], + "source": [ + "# Define an ngram range\n", + "ngram_range = range(1,7)\n", + "\n", + "\n", + "# The following code may take a minute to run, depending on your ngram_range\n", + "\"\"\"\n", + "DON'T MODIFY ANYTHING IN THIS CELL THAT IS BELOW THIS LINE\n", + "\"\"\"\n", + "features_list = []\n", + "\n", + "# Create features in a features_df\n", + "all_features = np.zeros((len(ngram_range)+1, len(complete_df)))\n", + "\n", + "# Calculate features for containment for ngrams in range\n", + "i=0\n", + "for n in ngram_range:\n", + " column_name = 'c_'+str(n)\n", + " features_list.append(column_name)\n", + " # create 
containment features\n", + " all_features[i]=np.squeeze(create_containment_features(complete_df, n))\n", + " i+=1\n", + "\n", + "# Calculate features for LCS_Norm Words \n", + "features_list.append('lcs_word')\n", + "all_features[i]= np.squeeze(create_lcs_features(complete_df))\n", + "\n", + "# create a features dataframe\n", + "features_df = pd.DataFrame(np.transpose(all_features), columns=features_list)\n", + "\n", + "# Print all features/columns\n", + "print()\n", + "print('Features: ', features_list)\n", + "print()" + ] + }, + { + "cell_type": "code", + "execution_count": 21, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
c_1c_2c_3c_4c_5c_6lcs_word
00.3981480.0790700.0093460.0000000.0000000.0000000.191781
11.0000000.9846940.9641030.9432990.9222800.9010420.820755
20.8693690.7194570.6136360.5159820.4495410.3824880.846491
30.5935830.2688170.1567570.1086960.0819670.0604400.316062
40.5445030.1157890.0317460.0053190.0000000.0000000.242574
50.3295020.0538460.0077220.0038760.0000000.0000000.161172
60.5903080.1504420.0355560.0044640.0000000.0000000.301653
70.7653060.7098980.6643840.6254300.5896550.5536330.621711
80.7597770.5056180.3954800.3068180.2457140.1954020.484305
90.8844440.5267860.3408070.2477480.1809950.1500000.597458
\n", + "
" + ], + "text/plain": [ + " c_1 c_2 c_3 c_4 c_5 c_6 lcs_word\n", + "0 0.398148 0.079070 0.009346 0.000000 0.000000 0.000000 0.191781\n", + "1 1.000000 0.984694 0.964103 0.943299 0.922280 0.901042 0.820755\n", + "2 0.869369 0.719457 0.613636 0.515982 0.449541 0.382488 0.846491\n", + "3 0.593583 0.268817 0.156757 0.108696 0.081967 0.060440 0.316062\n", + "4 0.544503 0.115789 0.031746 0.005319 0.000000 0.000000 0.242574\n", + "5 0.329502 0.053846 0.007722 0.003876 0.000000 0.000000 0.161172\n", + "6 0.590308 0.150442 0.035556 0.004464 0.000000 0.000000 0.301653\n", + "7 0.765306 0.709898 0.664384 0.625430 0.589655 0.553633 0.621711\n", + "8 0.759777 0.505618 0.395480 0.306818 0.245714 0.195402 0.484305\n", + "9 0.884444 0.526786 0.340807 0.247748 0.180995 0.150000 0.597458" + ] + }, + "execution_count": 21, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# print some results \n", + "features_df.head(10)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Correlated Features\n", + "\n", + "You should use feature correlation across the *entire* dataset to determine which features are ***too*** **highly-correlated** with each other to include both features in a single model. For this analysis, you can use the *entire* dataset due to the small sample size we have. \n", + "\n", + "All of our features try to measure the similarity between two texts. Since our features are designed to measure similarity, it is expected that these features will be highly-correlated. Many classification models, for example a Naive Bayes classifier, rely on the assumption that features are *not* highly correlated; highly-correlated features may over-inflate the importance of a single feature. \n", + "\n", + "So, you'll want to choose your features based on which pairings have the lowest correlation. These correlation values range between 0 and 1; from low to high correlation, and are displayed in a [correlation matrix](https://www.displayr.com/what-is-a-correlation-matrix/), below." + ] + }, + { + "cell_type": "code", + "execution_count": 22, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
c_1c_2c_3c_4c_5c_6lcs_word
c_11.000.940.900.890.880.870.97
c_20.941.000.990.980.970.960.98
c_30.900.991.001.000.990.980.97
c_40.890.981.001.001.000.990.95
c_50.880.970.991.001.001.000.95
c_60.870.960.980.991.001.000.94
lcs_word0.970.980.970.950.950.941.00
\n", + "
" + ], + "text/plain": [ + " c_1 c_2 c_3 c_4 c_5 c_6 lcs_word\n", + "c_1 1.00 0.94 0.90 0.89 0.88 0.87 0.97\n", + "c_2 0.94 1.00 0.99 0.98 0.97 0.96 0.98\n", + "c_3 0.90 0.99 1.00 1.00 0.99 0.98 0.97\n", + "c_4 0.89 0.98 1.00 1.00 1.00 0.99 0.95\n", + "c_5 0.88 0.97 0.99 1.00 1.00 1.00 0.95\n", + "c_6 0.87 0.96 0.98 0.99 1.00 1.00 0.94\n", + "lcs_word 0.97 0.98 0.97 0.95 0.95 0.94 1.00" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "\"\"\"\n", + "DON'T MODIFY ANYTHING IN THIS CELL THAT IS BELOW THIS LINE\n", + "\"\"\"\n", + "# Create correlation matrix for just Features to determine different models to test\n", + "corr_matrix = features_df.corr().abs().round(2)\n", + "\n", + "# display shows all of a dataframe\n", + "display(corr_matrix)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## EXERCISE: Create selected train/test data\n", + "\n", + "Complete the `train_test_data` function below. This function should take in the following parameters:\n", + "* `complete_df`: A DataFrame that contains all of our processed text data, file info, datatypes, and class labels\n", + "* `features_df`: A DataFrame of all calculated features, such as containment for ngrams, n= 1-5, and lcs values for each text file listed in the `complete_df` (this was created in the above cells)\n", + "* `selected_features`: A list of feature column names, ex. `['c_1', 'lcs_word']`, which will be used to select the final features in creating train/test sets of data.\n", + "\n", + "It should return two tuples:\n", + "* `(train_x, train_y)`, selected training features and their corresponding class labels (0/1)\n", + "* `(test_x, test_y)`, selected training features and their corresponding class labels (0/1)\n", + "\n", + "** Note: x and y should be arrays of feature values and numerical class labels, respectively; not DataFrames.**\n", + "\n", + "Looking at the above correlation matrix, you should decide on a **cutoff** correlation value, less than 1.0, to determine which sets of features are *too* highly-correlated to be included in the final training and test data. If you cannot find features that are less correlated than some cutoff value, it is suggested that you increase the number of features (longer n-grams) to choose from or use *only one or two* features in your final model to avoid introducing highly-correlated features.\n", + "\n", + "Recall that the `complete_df` has a `Datatype` column that indicates whether data should be `train` or `test` data; this should help you split the data appropriately." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 23, + "metadata": {}, + "outputs": [], + "source": [ + "# Takes in dataframes and a list of selected features (column names) \n", + "# and returns (train_x, train_y), (test_x, test_y)\n", + "def train_test_data(complete_df, features_df, selected_features):\n", + " '''Gets selected training and test features from given dataframes, and \n", + " returns tuples for training and test features and their corresponding class labels.\n", + " :param complete_df: A dataframe with all of our processed text data, datatypes, and labels\n", + " :param features_df: A dataframe of all computed, similarity features\n", + " :param selected_features: An array of selected features that correspond to certain columns in `features_df`\n", + " :return: training and test features and labels: (train_x, train_y), (test_x, test_y)'''\n", + " \n", + " merged_df = complete_df.merge(features_df, left_index=True, right_index=True)\n", + " \n", + " # get the training features\n", + " train_x = merged_df.loc[merged_df.Datatype == 'train', selected_features].values\n", + " # And training class labels (0 or 1)\n", + " train_y = merged_df.loc[merged_df.Datatype == 'train', 'Class'].values\n", + " \n", + " # get the test features and labels\n", + " test_x = merged_df.loc[merged_df.Datatype == 'test', selected_features].values\n", + " test_y = merged_df.loc[merged_df.Datatype == 'test', 'Class'].values\n", + " \n", + " return (train_x, train_y), (test_x, test_y)\n", + " " + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Test cells\n", + "\n", + "Below, test out your implementation and create the final train/test data." + ] + }, + { + "cell_type": "code", + "execution_count": 24, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Tests Passed!\n" + ] + } + ], + "source": [ + "\"\"\"\n", + "DON'T MODIFY ANYTHING IN THIS CELL THAT IS BELOW THIS LINE\n", + "\"\"\"\n", + "test_selection = list(features_df)[:2] # first couple columns as a test\n", + "# test that the correct train/test data is created\n", + "(train_x, train_y), (test_x, test_y) = train_test_data(complete_df, features_df, test_selection)\n", + "\n", + "# params: generated train/test data\n", + "tests.test_data_split(train_x, train_y, test_x, test_y)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## EXERCISE: Select \"good\" features\n", + "\n", + "If you passed the test above, you can create your own train/test data, below. \n", + "\n", + "Define a list of features you'd like to include in your final mode, `selected_features`; this is a list of the features names you want to include." + ] + }, + { + "cell_type": "code", + "execution_count": 25, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Training size: 70\n", + "Test size: 25\n", + "\n", + "Training df sample: \n", + " [[0.39814815 0. 0.19178082]\n", + " [0.86936937 0.44954128 0.84649123]\n", + " [0.59358289 0.08196721 0.31606218]\n", + " [0.54450262 0. 0.24257426]\n", + " [0.32950192 0. 0.16117216]\n", + " [0.59030837 0. 0.30165289]\n", + " [0.75977654 0.24571429 0.48430493]\n", + " [0.51612903 0. 0.27083333]\n", + " [0.44086022 0. 0.22395833]\n", + " [0.97945205 0.78873239 0.9 ]]\n" + ] + } + ], + "source": [ + "# Select your list of features, this should be column names from features_df\n", + "# ex. 
['c_1', 'lcs_word']\n", + "selected_features = ['c_1', 'c_5', 'lcs_word']\n", + "\n", + "\n", + "\"\"\"\n", + "DON'T MODIFY ANYTHING IN THIS CELL THAT IS BELOW THIS LINE\n", + "\"\"\"\n", + "\n", + "(train_x, train_y), (test_x, test_y) = train_test_data(complete_df, features_df, selected_features)\n", + "\n", + "# check that division of samples seems correct\n", + "# these should add up to 95 (100 - 5 original files)\n", + "print('Training size: ', len(train_x))\n", + "print('Test size: ', len(test_x))\n", + "print()\n", + "print('Training df sample: \\n', train_x[:10])" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Question 2: How did you decide on which features to include in your final model? " + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "**Answer:** We decide features based on uniqueness and their correlation to other features. \n", + "For example, in this model, c_1 and lcs_word represent single words common and longest common subsequence respectively, which are totally unique and hence must be selected.\n", + "Now we see c_2, c_3, c_4, c_5 are highly correlated to one another. So selecting one of them should do the work. We finally settle on selecting c_5 as it is least correlated to c_1 and lcs_word, and hence more unique than other features.\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "---\n", + "## Creating Final Data Files\n", + "\n", + "Now, you are almost ready to move on to training a model in SageMaker!\n", + "\n", + "You'll want to access your train and test data in SageMaker and upload it to S3. In this project, SageMaker will expect the following format for your train/test data:\n", + "* Training and test data should be saved in one `.csv` file each, ex `train.csv` and `test.csv`\n", + "* These files should have class labels in the first column and features in the rest of the columns\n", + "\n", + "This format follows the practice, outlined in the [SageMaker documentation](https://docs.aws.amazon.com/sagemaker/latest/dg/cdf-training.html), which reads: \"Amazon SageMaker requires that a CSV file doesn't have a header record and that the target variable [class label] is in the first column.\"\n", + "\n", + "## EXERCISE: Create csv files\n", + "\n", + "Define a function that takes in x (features) and y (labels) and saves them to one `.csv` file at the path `data_dir/filename`.\n", + "\n", + "It may be useful to use pandas to merge your features and labels into one DataFrame and then convert that into a csv file. You can make sure to get rid of any incomplete rows, in a DataFrame, by using `dropna`." + ] + }, + { + "cell_type": "code", + "execution_count": 26, + "metadata": {}, + "outputs": [], + "source": [ + "def make_csv(x, y, filename, data_dir):\n", + " '''Merges features and labels and converts them into one csv file with labels in the first column.\n", + " :param x: Data features\n", + " :param y: Data labels\n", + " :param file_name: Name of csv file, ex. 
'train.csv'\n", + " :param data_dir: The directory where files will be saved\n", + " '''\n", + " # make data dir, if it does not exist\n", + " if not os.path.exists(data_dir):\n", + " os.makedirs(data_dir)\n", + " \n", + " # your code here\n", + " df = pd.concat([pd.DataFrame(y), pd.DataFrame(x)], axis=1).dropna()\n", + " df.to_csv(os.path.join(data_dir, filename), header=False, index=False)\n", + " \n", + " # nothing is returned, but a print statement indicates that the function has run\n", + " print('Path created: '+str(data_dir)+'/'+str(filename))" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Test cells\n", + "\n", + "Test that your code produces the correct format for a `.csv` file, given some text features and labels." + ] + }, + { + "cell_type": "code", + "execution_count": 27, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Path created: test_csv/to_delete.csv\n", + "Tests passed!\n" + ] + } + ], + "source": [ + "\"\"\"\n", + "DON'T MODIFY ANYTHING IN THIS CELL THAT IS BELOW THIS LINE\n", + "\"\"\"\n", + "fake_x = [ [0.39814815, 0.0001, 0.19178082], \n", + " [0.86936937, 0.44954128, 0.84649123], \n", + " [0.44086022, 0., 0.22395833] ]\n", + "\n", + "fake_y = [0, 1, 1]\n", + "\n", + "make_csv(fake_x, fake_y, filename='to_delete.csv', data_dir='test_csv')\n", + "\n", + "# read in and test dimensions\n", + "fake_df = pd.read_csv('test_csv/to_delete.csv', header=None)\n", + "\n", + "# check shape\n", + "assert fake_df.shape==(3, 4), \\\n", + " 'The file should have as many rows as data_points and as many columns as features+1 (for indices).'\n", + "# check that first column = labels\n", + "assert np.all(fake_df.iloc[:,0].values==fake_y), 'First column is not equal to the labels, fake_y.'\n", + "print('Tests passed!')" + ] + }, + { + "cell_type": "code", + "execution_count": 28, + "metadata": {}, + "outputs": [], + "source": [ + "# delete the test csv file, generated above\n", + "! rm -rf test_csv" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "If you've passed the tests above, run the following cell to create `train.csv` and `test.csv` files in a directory that you specify! This will save the data in a local directory. Remember the name of this directory because you will reference it again when uploading this data to S3." + ] + }, + { + "cell_type": "code", + "execution_count": 29, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Path created: plagiarism_data/train.csv\n", + "Path created: plagiarism_data/test.csv\n" + ] + } + ], + "source": [ + "# can change directory, if you want\n", + "data_dir = 'plagiarism_data'\n", + "\n", + "\"\"\"\n", + "DON'T MODIFY ANYTHING IN THIS CELL THAT IS BELOW THIS LINE\n", + "\"\"\"\n", + "\n", + "make_csv(train_x, train_y, filename='train.csv', data_dir=data_dir)\n", + "make_csv(test_x, test_y, filename='test.csv', data_dir=data_dir)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Up Next\n", + "\n", + "Now that you've done some feature engineering and created some training and test data, you are ready to train and deploy a plagiarism classification model. The next notebook will utilize SageMaker resources to train and test a model that you design." 
+ ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "conda_amazonei_mxnet_p36", + "language": "python", + "name": "conda_amazonei_mxnet_p36" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.6.5" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/3_Training_a_Model.ipynb b/3_Training_a_Model.ipynb new file mode 100644 index 0000000..769f066 --- /dev/null +++ b/3_Training_a_Model.ipynb @@ -0,0 +1,789 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Plagiarism Detection Model\n", + "\n", + "Now that you've created training and test data, you are ready to define and train a model. Your goal in this notebook, will be to train a binary classification model that learns to label an answer file as either plagiarized or not, based on the features you provide the model.\n", + "\n", + "This task will be broken down into a few discrete steps:\n", + "\n", + "* Upload your data to S3.\n", + "* Define a binary classification model and a training script.\n", + "* Train your model and deploy it.\n", + "* Evaluate your deployed classifier and answer some questions about your approach.\n", + "\n", + "To complete this notebook, you'll have to complete all given exercises and answer all the questions in this notebook.\n", + "> All your tasks will be clearly labeled **EXERCISE** and questions as **QUESTION**.\n", + "\n", + "It will be up to you to explore different classification models and decide on a model that gives you the best performance for this dataset.\n", + "\n", + "---" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Load Data to S3\n", + "\n", + "In the last notebook, you should have created two files: a `training.csv` and `test.csv` file with the features and class labels for the given corpus of plagiarized/non-plagiarized text data. \n", + "\n", + ">The below cells load in some AWS SageMaker libraries and creates a default bucket. After creating this bucket, you can upload your locally stored data to S3.\n", + "\n", + "Save your train and test `.csv` feature files, locally. To do this you can run the second notebook \"2_Plagiarism_Feature_Engineering\" in SageMaker or you can manually upload your files to this notebook using the upload icon in Jupyter Lab. Then you can upload local files to S3 by using `sagemaker_session.upload_data` and pointing directly to where the training data is saved." + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [], + "source": [ + "import pandas as pd\n", + "import boto3\n", + "import sagemaker" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [], + "source": [ + "\"\"\"\n", + "DON'T MODIFY ANYTHING IN THIS CELL THAT IS BELOW THIS LINE\n", + "\"\"\"\n", + "# session and role\n", + "sagemaker_session = sagemaker.Session()\n", + "role = sagemaker.get_execution_role()\n", + "\n", + "# create an S3 bucket\n", + "bucket = sagemaker_session.default_bucket()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## EXERCISE: Upload your training data to S3\n", + "\n", + "Specify the `data_dir` where you've saved your `train.csv` file. Decide on a descriptive `prefix` that defines where your data will be uploaded in the default S3 bucket. 
Finally, create a pointer to your training data by calling `sagemaker_session.upload_data` and passing in the required parameters. It may help to look at the [Session documentation](https://sagemaker.readthedocs.io/en/stable/session.html#sagemaker.session.Session.upload_data) or previous SageMaker code examples.\n", + "\n", + "You are expected to upload your entire directory. Later, the training script will only access the `train.csv` file." + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [], + "source": [ + "# should be the name of directory you created to save your features data\n", + "data_dir = 'plagiarism_data'\n", + "\n", + "# set prefix, a descriptive name for a directory \n", + "prefix = 'sagemaker/plagiarism_detector'\n", + "\n", + "# upload all data to S3\n", + "input_data = sagemaker_session.upload_data(path=data_dir, bucket=bucket, key_prefix=prefix)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Test cell\n", + "\n", + "Test that your data has been successfully uploaded. The below cell prints out the items in your S3 bucket and will throw an error if it is empty. You should see the contents of your `data_dir` and perhaps some checkpoints. If you see any other files listed, then you may have some old model files that you can delete via the S3 console (though, additional files shouldn't affect the performance of model developed in this notebook)." + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "sagemaker-scikit-learn-2020-01-07-09-01-41-326/debug-output/training_job_end.ts\n", + "sagemaker-scikit-learn-2020-01-07-09-01-41-326/output/model.tar.gz\n", + "sagemaker-scikit-learn-2020-01-07-09-01-41-326/source/sourcedir.tar.gz\n", + "sagemaker/plagiarism_detector/test.csv\n", + "sagemaker/plagiarism_detector/train.csv\n", + "Test passed!\n" + ] + } + ], + "source": [ + "\"\"\"\n", + "DON'T MODIFY ANYTHING IN THIS CELL THAT IS BELOW THIS LINE\n", + "\"\"\"\n", + "# confirm that data is in S3 bucket\n", + "empty_check = []\n", + "for obj in boto3.resource('s3').Bucket(bucket).objects.all():\n", + " empty_check.append(obj.key)\n", + " print(obj.key)\n", + "\n", + "assert len(empty_check) !=0, 'S3 bucket is empty.'\n", + "print('Test passed!')" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "---\n", + "\n", + "# Modeling\n", + "\n", + "Now that you've uploaded your training data, it's time to define and train a model!\n", + "\n", + "The type of model you create is up to you. For a binary classification task, you can choose to go one of three routes:\n", + "* Use a built-in classification algorithm, like LinearLearner.\n", + "* Define a custom Scikit-learn classifier, a comparison of models can be found [here](https://scikit-learn.org/stable/auto_examples/classification/plot_classifier_comparison.html).\n", + "* Define a custom PyTorch neural network classifier. \n", + "\n", + "It will be up to you to test out a variety of models and choose the best one. Your project will be graded on the accuracy of your final model. \n", + " \n", + "---\n", + "\n", + "## EXERCISE: Complete a training script \n", + "\n", + "To implement a custom classifier, you'll need to complete a `train.py` script. You've been given the folders `source_sklearn` and `source_pytorch` which hold starting code for a custom Scikit-learn model and a PyTorch model, respectively. 
Each directory has a `train.py` training script. To complete this project **you only need to complete one of these scripts**; the script that is responsible for training your final model.\n", + "\n", + "A typical training script:\n", + "* Loads training data from a specified directory\n", + "* Parses any training & model hyperparameters (ex. nodes in a neural network, training epochs, etc.)\n", + "* Instantiates a model of your design, with any specified hyperparams\n", + "* Trains that model \n", + "* Finally, saves the model so that it can be hosted/deployed, later\n", + "\n", + "### Defining and training a model\n", + "Much of the training script code is provided for you. Almost all of your work will be done in the `if __name__ == '__main__':` section. To complete a `train.py` file, you will:\n", + "1. Import any extra libraries you need\n", + "2. Define any additional model training hyperparameters using `parser.add_argument`\n", + "2. Define a model in the `if __name__ == '__main__':` section\n", + "3. Train the model in that same section\n", + "\n", + "Below, you can use `!pygmentize` to display an existing `train.py` file. Read through the code; all of your tasks are marked with `TODO` comments. \n", + "\n", + "**Note: If you choose to create a custom PyTorch model, you will be responsible for defining the model in the `model.py` file,** and a `predict.py` file is provided. If you choose to use Scikit-learn, you only need a `train.py` file; you may import a classifier from the `sklearn` library." + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\u001b[34mfrom\u001b[39;49;00m \u001b[04m\u001b[36m__future__\u001b[39;49;00m \u001b[34mimport\u001b[39;49;00m print_function\r\n", + "\r\n", + "\u001b[34mimport\u001b[39;49;00m \u001b[04m\u001b[36margparse\u001b[39;49;00m\r\n", + "\u001b[34mimport\u001b[39;49;00m \u001b[04m\u001b[36mos\u001b[39;49;00m\r\n", + "\u001b[34mimport\u001b[39;49;00m \u001b[04m\u001b[36mpandas\u001b[39;49;00m \u001b[34mas\u001b[39;49;00m \u001b[04m\u001b[36mpd\u001b[39;49;00m\r\n", + "\r\n", + "\u001b[34mfrom\u001b[39;49;00m \u001b[04m\u001b[36msklearn.externals\u001b[39;49;00m \u001b[34mimport\u001b[39;49;00m joblib\r\n", + "\r\n", + "\u001b[37m## TODO: Import any additional libraries you need to define a model\u001b[39;49;00m\r\n", + "\u001b[34mfrom\u001b[39;49;00m \u001b[04m\u001b[36msklearn.linear_model\u001b[39;49;00m \u001b[34mimport\u001b[39;49;00m LogisticRegression\r\n", + "\r\n", + "\u001b[37m# Provided model load function\u001b[39;49;00m\r\n", + "\u001b[34mdef\u001b[39;49;00m \u001b[32mmodel_fn\u001b[39;49;00m(model_dir):\r\n", + " \u001b[33m\"\"\"Load model from the model_dir. 
This is the same model that is saved\u001b[39;49;00m\r\n", + "\u001b[33m in the main if statement.\u001b[39;49;00m\r\n", + "\u001b[33m \"\"\"\u001b[39;49;00m\r\n", + " \u001b[34mprint\u001b[39;49;00m(\u001b[33m\"\u001b[39;49;00m\u001b[33mLoading model.\u001b[39;49;00m\u001b[33m\"\u001b[39;49;00m)\r\n", + " \r\n", + " \u001b[37m# load using joblib\u001b[39;49;00m\r\n", + " model = joblib.load(os.path.join(model_dir, \u001b[33m\"\u001b[39;49;00m\u001b[33mmodel.joblib\u001b[39;49;00m\u001b[33m\"\u001b[39;49;00m))\r\n", + " \u001b[34mprint\u001b[39;49;00m(\u001b[33m\"\u001b[39;49;00m\u001b[33mDone loading model.\u001b[39;49;00m\u001b[33m\"\u001b[39;49;00m)\r\n", + " \r\n", + " \u001b[34mreturn\u001b[39;49;00m model\r\n", + "\r\n", + "\r\n", + "\u001b[37m## TODO: Complete the main code\u001b[39;49;00m\r\n", + "\u001b[34mif\u001b[39;49;00m \u001b[31m__name__\u001b[39;49;00m == \u001b[33m'\u001b[39;49;00m\u001b[33m__main__\u001b[39;49;00m\u001b[33m'\u001b[39;49;00m:\r\n", + " \r\n", + " \u001b[37m# All of the model parameters and training parameters are sent as arguments\u001b[39;49;00m\r\n", + " \u001b[37m# when this script is executed, during a training job\u001b[39;49;00m\r\n", + " \r\n", + " \u001b[37m# Here we set up an argument parser to easily access the parameters\u001b[39;49;00m\r\n", + " parser = argparse.ArgumentParser()\r\n", + "\r\n", + " \u001b[37m# SageMaker parameters, like the directories for training data and saving models; set automatically\u001b[39;49;00m\r\n", + " \u001b[37m# Do not need to change\u001b[39;49;00m\r\n", + " parser.add_argument(\u001b[33m'\u001b[39;49;00m\u001b[33m--output-data-dir\u001b[39;49;00m\u001b[33m'\u001b[39;49;00m, \u001b[36mtype\u001b[39;49;00m=\u001b[36mstr\u001b[39;49;00m, default=os.environ[\u001b[33m'\u001b[39;49;00m\u001b[33mSM_OUTPUT_DATA_DIR\u001b[39;49;00m\u001b[33m'\u001b[39;49;00m])\r\n", + " parser.add_argument(\u001b[33m'\u001b[39;49;00m\u001b[33m--model-dir\u001b[39;49;00m\u001b[33m'\u001b[39;49;00m, \u001b[36mtype\u001b[39;49;00m=\u001b[36mstr\u001b[39;49;00m, default=os.environ[\u001b[33m'\u001b[39;49;00m\u001b[33mSM_MODEL_DIR\u001b[39;49;00m\u001b[33m'\u001b[39;49;00m])\r\n", + " parser.add_argument(\u001b[33m'\u001b[39;49;00m\u001b[33m--data-dir\u001b[39;49;00m\u001b[33m'\u001b[39;49;00m, \u001b[36mtype\u001b[39;49;00m=\u001b[36mstr\u001b[39;49;00m, default=os.environ[\u001b[33m'\u001b[39;49;00m\u001b[33mSM_CHANNEL_TRAIN\u001b[39;49;00m\u001b[33m'\u001b[39;49;00m])\r\n", + " \r\n", + " \u001b[37m## TODO: Add any additional arguments that you will need to pass into your model\u001b[39;49;00m\r\n", + " \r\n", + " \u001b[37m# args holds all passed-in arguments\u001b[39;49;00m\r\n", + " args = parser.parse_args()\r\n", + "\r\n", + " \u001b[37m# Read in csv training file\u001b[39;49;00m\r\n", + " training_dir = args.data_dir\r\n", + " train_data = pd.read_csv(os.path.join(training_dir, \u001b[33m\"\u001b[39;49;00m\u001b[33mtrain.csv\u001b[39;49;00m\u001b[33m\"\u001b[39;49;00m), header=\u001b[36mNone\u001b[39;49;00m, names=\u001b[36mNone\u001b[39;49;00m)\r\n", + "\r\n", + " \u001b[37m# Labels are in the first column\u001b[39;49;00m\r\n", + " train_y = train_data.iloc[:,\u001b[34m0\u001b[39;49;00m]\r\n", + " train_x = train_data.iloc[:,\u001b[34m1\u001b[39;49;00m:]\r\n", + " \r\n", + " \r\n", + " \u001b[37m## --- Your code here --- ##\u001b[39;49;00m\r\n", + " \r\n", + "\r\n", + " \u001b[37m## TODO: Define a model \u001b[39;49;00m\r\n", + " model = LogisticRegression()\r\n", + " \r\n", + " \r\n", + " \u001b[37m## TODO: Train the 
model\u001b[39;49;00m\r\n",
+      "    model.fit(train_x, train_y)\r\n",
+      "    \r\n",
+      "    \r\n",
+      "    \u001b[37m## --- End of your code --- ##\u001b[39;49;00m\r\n",
+      "    \r\n",
+      "\r\n",
+      "    \u001b[37m# Save the trained model\u001b[39;49;00m\r\n",
+      "    joblib.dump(model, os.path.join(args.model_dir, \u001b[33m\"\u001b[39;49;00m\u001b[33mmodel.joblib\u001b[39;49;00m\u001b[33m\"\u001b[39;49;00m))\r\n"
+     ]
+    }
+   ],
+   "source": [
+    "# directory can be changed to: source_sklearn or source_pytorch\n",
+    "!pygmentize source_sklearn/train.py"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "### Provided code\n",
+    "\n",
+    "If you read the code above, you can see that the starter code includes a few things:\n",
+    "* Model loading (`model_fn`) and saving code\n",
+    "* Getting SageMaker's default hyperparameters\n",
+    "* Loading the training data by name, `train.csv`, and extracting the features and labels, `train_x` and `train_y`\n",
+    "\n",
+    "If you'd like to read more about model saving with [joblib for sklearn](https://scikit-learn.org/stable/modules/model_persistence.html) or with [torch.save](https://pytorch.org/tutorials/beginner/saving_loading_models.html), click on the provided links."
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "---\n",
+    "# Create an Estimator\n",
+    "\n",
+    "When a custom model is constructed in SageMaker, an entry point must be specified. This is the Python file that will be executed when the model is trained; here, that is the `train.py` file you completed above. To run a custom training script in SageMaker, construct an estimator and fill in the appropriate constructor arguments:\n",
+    "\n",
+    "* **entry_point**: The path to the Python script SageMaker runs for training and prediction.\n",
+    "* **source_dir**: The path to the training script directory, `source_sklearn` OR `source_pytorch`.\n",
+    "* **role**: Role ARN, which was specified above.\n",
+    "* **train_instance_count**: The number of training instances (should be left at 1).\n",
+    "* **train_instance_type**: The type of SageMaker instance for training. 
Note: Because Scikit-learn does not natively support GPU training, Sagemaker Scikit-learn does not currently support training on GPU instance types.\n", + "* **sagemaker_session**: The session used to train on Sagemaker.\n", + "* **hyperparameters** (optional): A dictionary `{'name':value, ..}` passed to the train function as hyperparameters.\n", + "\n", + "Note: For a PyTorch model, there is another optional argument **framework_version**, which you can set to the latest version of PyTorch, `1.0`.\n", + "\n", + "## EXERCISE: Define a Scikit-learn or PyTorch estimator\n", + "\n", + "To import your desired estimator, use one of the following lines:\n", + "```\n", + "from sagemaker.sklearn.estimator import SKLearn\n", + "```\n", + "```\n", + "from sagemaker.pytorch import PyTorch\n", + "```" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": {}, + "outputs": [], + "source": [ + "# your import and estimator code, here\n", + "from sagemaker.sklearn.estimator import SKLearn" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## EXERCISE: Train the estimator\n", + "\n", + "Train your estimator on the training data stored in S3. This should create a training job that you can monitor in your SageMaker console." + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "CPU times: user 100 µs, sys: 8 µs, total: 108 µs\n", + "Wall time: 113 µs\n" + ] + } + ], + "source": [ + "%%time\n", + "\n", + "# Train your estimator on S3 training data\n", + "estimator = SKLearn(role=role,\n", + " sagemaker_session=sagemaker_session,\n", + " train_instance_count=1,\n", + " train_instance_type='ml.m4.xlarge',\n", + " entry_point='train.py',\n", + " source_dir='source_sklearn'\n", + " )\n" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "2020-01-07 09:09:49 Starting - Starting the training job...\n", + "2020-01-07 09:09:52 Starting - Launching requested ML instances......\n", + "2020-01-07 09:11:00 Starting - Preparing the instances for training......\n", + "2020-01-07 09:11:59 Downloading - Downloading input data...\n", + "2020-01-07 09:12:45 Training - Training image download completed. Training in progress..\u001b[34m2020-01-07 09:12:46,565 sagemaker-containers INFO Imported framework sagemaker_sklearn_container.training\u001b[0m\n", + "\u001b[34m2020-01-07 09:12:46,567 sagemaker-containers INFO No GPUs detected (normal if no gpus installed)\u001b[0m\n", + "\u001b[34m2020-01-07 09:12:46,579 sagemaker_sklearn_container.training INFO Invoking user training script.\u001b[0m\n", + "\u001b[34m2020-01-07 09:12:46,879 sagemaker-containers INFO Module train does not provide a setup.py. \u001b[0m\n", + "\u001b[34mGenerating setup.py\u001b[0m\n", + "\u001b[34m2020-01-07 09:12:46,879 sagemaker-containers INFO Generating setup.cfg\u001b[0m\n", + "\u001b[34m2020-01-07 09:12:46,879 sagemaker-containers INFO Generating MANIFEST.in\u001b[0m\n", + "\u001b[34m2020-01-07 09:12:46,879 sagemaker-containers INFO Installing module with the following command:\u001b[0m\n", + "\u001b[34m/miniconda3/bin/python -m pip install . 
\u001b[0m\n", + "\u001b[34mProcessing /opt/ml/code\u001b[0m\n", + "\u001b[34mBuilding wheels for collected packages: train\n", + " Building wheel for train (setup.py): started\n", + " Building wheel for train (setup.py): finished with status 'done'\n", + " Created wheel for train: filename=train-1.0.0-py2.py3-none-any.whl size=5830 sha256=89f0f979d7c997c9fa98f05ad318aa496237540267e8f3bbaa7891a289b94c0c\n", + " Stored in directory: /tmp/pip-ephem-wheel-cache-tkej929b/wheels/35/24/16/37574d11bf9bde50616c67372a334f94fa8356bc7164af8ca3\u001b[0m\n", + "\u001b[34mSuccessfully built train\u001b[0m\n", + "\u001b[34mInstalling collected packages: train\u001b[0m\n", + "\u001b[34mSuccessfully installed train-1.0.0\u001b[0m\n", + "\u001b[34m2020-01-07 09:12:48,410 sagemaker-containers INFO No GPUs detected (normal if no gpus installed)\u001b[0m\n", + "\u001b[34m2020-01-07 09:12:48,423 sagemaker-containers INFO Invoking user script\n", + "\u001b[0m\n", + "\u001b[34mTraining Env:\n", + "\u001b[0m\n", + "\u001b[34m{\n", + " \"additional_framework_parameters\": {},\n", + " \"channel_input_dirs\": {\n", + " \"train\": \"/opt/ml/input/data/train\"\n", + " },\n", + " \"current_host\": \"algo-1\",\n", + " \"framework_module\": \"sagemaker_sklearn_container.training:main\",\n", + " \"hosts\": [\n", + " \"algo-1\"\n", + " ],\n", + " \"hyperparameters\": {},\n", + " \"input_config_dir\": \"/opt/ml/input/config\",\n", + " \"input_data_config\": {\n", + " \"train\": {\n", + " \"TrainingInputMode\": \"File\",\n", + " \"S3DistributionType\": \"FullyReplicated\",\n", + " \"RecordWrapperType\": \"None\"\n", + " }\n", + " },\n", + " \"input_dir\": \"/opt/ml/input\",\n", + " \"is_master\": true,\n", + " \"job_name\": \"sagemaker-scikit-learn-2020-01-07-09-09-49-538\",\n", + " \"log_level\": 20,\n", + " \"master_hostname\": \"algo-1\",\n", + " \"model_dir\": \"/opt/ml/model\",\n", + " \"module_dir\": \"s3://sagemaker-us-east-1-309164732448/sagemaker-scikit-learn-2020-01-07-09-09-49-538/source/sourcedir.tar.gz\",\n", + " \"module_name\": \"train\",\n", + " \"network_interface_name\": \"eth0\",\n", + " \"num_cpus\": 4,\n", + " \"num_gpus\": 0,\n", + " \"output_data_dir\": \"/opt/ml/output/data\",\n", + " \"output_dir\": \"/opt/ml/output\",\n", + " \"output_intermediate_dir\": \"/opt/ml/output/intermediate\",\n", + " \"resource_config\": {\n", + " \"current_host\": \"algo-1\",\n", + " \"hosts\": [\n", + " \"algo-1\"\n", + " ],\n", + " \"network_interface_name\": \"eth0\"\n", + " },\n", + " \"user_entry_point\": \"train.py\"\u001b[0m\n", + "\u001b[34m}\n", + "\u001b[0m\n", + "\u001b[34mEnvironment variables:\n", + "\u001b[0m\n", + "\u001b[34mSM_HOSTS=[\"algo-1\"]\u001b[0m\n", + "\u001b[34mSM_NETWORK_INTERFACE_NAME=eth0\u001b[0m\n", + "\u001b[34mSM_HPS={}\u001b[0m\n", + "\u001b[34mSM_USER_ENTRY_POINT=train.py\u001b[0m\n", + "\u001b[34mSM_FRAMEWORK_PARAMS={}\u001b[0m\n", + "\u001b[34mSM_RESOURCE_CONFIG={\"current_host\":\"algo-1\",\"hosts\":[\"algo-1\"],\"network_interface_name\":\"eth0\"}\u001b[0m\n", + "\u001b[34mSM_INPUT_DATA_CONFIG={\"train\":{\"RecordWrapperType\":\"None\",\"S3DistributionType\":\"FullyReplicated\",\"TrainingInputMode\":\"File\"}}\u001b[0m\n", + "\u001b[34mSM_OUTPUT_DATA_DIR=/opt/ml/output/data\u001b[0m\n", + "\u001b[34mSM_CHANNELS=[\"train\"]\u001b[0m\n", + "\u001b[34mSM_CURRENT_HOST=algo-1\u001b[0m\n", + "\u001b[34mSM_MODULE_NAME=train\u001b[0m\n", + "\u001b[34mSM_LOG_LEVEL=20\u001b[0m\n", + "\u001b[34mSM_FRAMEWORK_MODULE=sagemaker_sklearn_container.training:main\u001b[0m\n", + 
"\u001b[34mSM_INPUT_DIR=/opt/ml/input\u001b[0m\n", + "\u001b[34mSM_INPUT_CONFIG_DIR=/opt/ml/input/config\u001b[0m\n", + "\u001b[34mSM_OUTPUT_DIR=/opt/ml/output\u001b[0m\n", + "\u001b[34mSM_NUM_CPUS=4\u001b[0m\n", + "\u001b[34mSM_NUM_GPUS=0\u001b[0m\n", + "\u001b[34mSM_MODEL_DIR=/opt/ml/model\u001b[0m\n", + "\u001b[34mSM_MODULE_DIR=s3://sagemaker-us-east-1-309164732448/sagemaker-scikit-learn-2020-01-07-09-09-49-538/source/sourcedir.tar.gz\u001b[0m\n", + "\u001b[34mSM_TRAINING_ENV={\"additional_framework_parameters\":{},\"channel_input_dirs\":{\"train\":\"/opt/ml/input/data/train\"},\"current_host\":\"algo-1\",\"framework_module\":\"sagemaker_sklearn_container.training:main\",\"hosts\":[\"algo-1\"],\"hyperparameters\":{},\"input_config_dir\":\"/opt/ml/input/config\",\"input_data_config\":{\"train\":{\"RecordWrapperType\":\"None\",\"S3DistributionType\":\"FullyReplicated\",\"TrainingInputMode\":\"File\"}},\"input_dir\":\"/opt/ml/input\",\"is_master\":true,\"job_name\":\"sagemaker-scikit-learn-2020-01-07-09-09-49-538\",\"log_level\":20,\"master_hostname\":\"algo-1\",\"model_dir\":\"/opt/ml/model\",\"module_dir\":\"s3://sagemaker-us-east-1-309164732448/sagemaker-scikit-learn-2020-01-07-09-09-49-538/source/sourcedir.tar.gz\",\"module_name\":\"train\",\"network_interface_name\":\"eth0\",\"num_cpus\":4,\"num_gpus\":0,\"output_data_dir\":\"/opt/ml/output/data\",\"output_dir\":\"/opt/ml/output\",\"output_intermediate_dir\":\"/opt/ml/output/intermediate\",\"resource_config\":{\"current_host\":\"algo-1\",\"hosts\":[\"algo-1\"],\"network_interface_name\":\"eth0\"},\"user_entry_point\":\"train.py\"}\u001b[0m\n", + "\u001b[34mSM_USER_ARGS=[]\u001b[0m\n", + "\u001b[34mSM_OUTPUT_INTERMEDIATE_DIR=/opt/ml/output/intermediate\u001b[0m\n", + "\u001b[34mSM_CHANNEL_TRAIN=/opt/ml/input/data/train\u001b[0m\n", + "\u001b[34mPYTHONPATH=/miniconda3/bin:/miniconda3/lib/python37.zip:/miniconda3/lib/python3.7:/miniconda3/lib/python3.7/lib-dynload:/miniconda3/lib/python3.7/site-packages\n", + "\u001b[0m\n", + "\u001b[34mInvoking script with the following command:\n", + "\u001b[0m\n", + "\u001b[34m/miniconda3/bin/python -m train\n", + "\n", + "\u001b[0m\n", + "\u001b[34m/miniconda3/lib/python3.7/site-packages/sklearn/externals/joblib/externals/cloudpickle/cloudpickle.py:47: DeprecationWarning: the imp module is deprecated in favour of importlib; see the module's documentation for alternative uses\n", + " import imp\u001b[0m\n", + "\u001b[34m/miniconda3/lib/python3.7/site-packages/sklearn/linear_model/logistic.py:432: FutureWarning: Default solver will be changed to 'lbfgs' in 0.22. Specify a solver to silence this warning.\n", + " FutureWarning)\u001b[0m\n", + "\u001b[34m2020-01-07 09:12:49,705 sagemaker-containers INFO Reporting training SUCCESS\u001b[0m\n", + "\n", + "2020-01-07 09:13:10 Uploading - Uploading generated training model\n", + "2020-01-07 09:13:10 Completed - Training job completed\n", + "Training seconds: 71\n", + "Billable seconds: 71\n" + ] + } + ], + "source": [ + "estimator.fit({'train': input_data})" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## EXERCISE: Deploy the trained model\n", + "\n", + "After training, deploy your model to create a `predictor`. If you're using a PyTorch model, you'll need to create a trained `PyTorchModel` that accepts the trained `.model_data` as an input parameter and points to the provided `source_pytorch/predict.py` file as an entry point. 
\n", + "\n", + "To deploy a trained model, you'll use `.deploy`, which takes in two arguments:\n", + "* **initial_instance_count**: The number of deployed instances (1).\n", + "* **instance_type**: The type of SageMaker instance for deployment.\n", + "\n", + "Note: If you run into an instance error, it may be because you chose the wrong training or deployment instance_type. It may help to refer to your previous exercise code to see which types of instances we used." + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "---------------------------------------------------------------------------------------------------!CPU times: user 520 ms, sys: 19.4 ms, total: 540 ms\n", + "Wall time: 8min 20s\n" + ] + } + ], + "source": [ + "%%time\n", + "\n", + "# uncomment, if needed\n", + "# from sagemaker.pytorch import PyTorchModel\n", + "\n", + "\n", + "# deploy your model to create a predictor\n", + "predictor = estimator.deploy(initial_instance_count=1, instance_type='ml.m4.xlarge')" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "---\n", + "# Evaluating Your Model\n", + "\n", + "Once your model is deployed, you can see how it performs when applied to our test data.\n", + "\n", + "The provided cell below, reads in the test data, assuming it is stored locally in `data_dir` and named `test.csv`. The labels and features are extracted from the `.csv` file." + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "metadata": {}, + "outputs": [], + "source": [ + "\"\"\"\n", + "DON'T MODIFY ANYTHING IN THIS CELL THAT IS BELOW THIS LINE\n", + "\"\"\"\n", + "import os\n", + "\n", + "# read in test data, assuming it is stored locally\n", + "test_data = pd.read_csv(os.path.join(data_dir, \"test.csv\"), header=None, names=None)\n", + "\n", + "# labels are in the first column\n", + "test_y = test_data.iloc[:,0]\n", + "test_x = test_data.iloc[:,1:]" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## EXERCISE: Determine the accuracy of your model\n", + "\n", + "Use your deployed `predictor` to generate predicted, class labels for the test data. Compare those to the *true* labels, `test_y`, and calculate the accuracy as a value between 0 and 1.0 that indicates the fraction of test data that your model classified correctly. 
You may use [sklearn.metrics](https://scikit-learn.org/stable/modules/classes.html#module-sklearn.metrics) for this calculation.\n",
+    "\n",
+    "**To pass this project, your model should get at least 90% test accuracy.**"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 11,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Test passed!\n"
+     ]
+    }
+   ],
+   "source": [
+    "# First: generate predicted class labels\n",
+    "test_y_preds = predictor.predict(test_x)\n",
+    "\n",
+    "\n",
+    "\"\"\"\n",
+    "DON'T MODIFY ANYTHING IN THIS CELL THAT IS BELOW THIS LINE\n",
+    "\"\"\"\n",
+    "# test that your model generates the correct number of labels\n",
+    "assert len(test_y_preds)==len(test_y), 'Unexpected number of predictions.'\n",
+    "print('Test passed!')"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 12,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Recall: 1.0\n",
+      "Precision: 0.9375\n",
+      "Accuracy: 0.96\n",
+      "\n",
+      "Predicted class labels: \n",
+      "[1 1 1 1 1 1 0 0 0 0 0 0 1 1 1 1 1 1 0 1 1 1 1 0 0]\n",
+      "\n",
+      "True class labels: \n",
+      "[1 1 1 1 1 1 0 0 0 0 0 0 1 1 1 1 1 1 0 1 0 1 1 0 0]\n"
+     ]
+    }
+   ],
+   "source": [
+    "# Second: calculate the test accuracy, recall, and precision\n",
+    "# count true positives, false positives, and false negatives\n",
+    "true_positives = ((test_y_preds == 1) & (test_y == 1)).sum()\n",
+    "false_positives = ((test_y_preds == 1) & (test_y == 0)).sum()\n",
+    "false_negatives = ((test_y_preds == 0) & (test_y == 1)).sum()\n",
+    "\n",
+    "accuracy = (test_y_preds == test_y).sum() / len(test_y)\n",
+    "recall = true_positives / (true_positives + false_negatives)\n",
+    "precision = true_positives / (true_positives + false_positives)\n",
+    "\n",
+    "print('Recall:', recall)\n",
+    "print('Precision:', precision)\n",
+    "print('Accuracy:', accuracy)\n",
+    "\n",
+    "\n",
+    "## print out the array of predicted and true labels, if you want\n",
+    "print('\\nPredicted class labels: ')\n",
+    "print(test_y_preds)\n",
+    "print('\\nTrue class labels: ')\n",
+    "print(test_y.values)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "### Question 1: How many false positives and false negatives did your model produce, if any? And why do you think this is?"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "**Answer**: The model produced one false positive and no false negatives: a single non-plagiarized answer (the 21st entry in the printed labels) was classified as plagiarized, giving a test accuracy of 96%. The test set is small (25 files) and the similarity features separate the two classes well, so one borderline answer accounts for the only error.\n"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "### Question 2: How did you decide on the type of model to use? "
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "**Answer**: This is a binary classification problem with a small number of numerical similarity features, so logistic regression is a natural choice: it is simple, fast to train, unlikely to overfit such a small feature set, and it maps the n input features directly to a class label (with an associated probability)."
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "----\n",
+    "## EXERCISE: Clean up Resources\n",
+    "\n",
+    "After you're done evaluating your model, **delete your model endpoint**. You can do this with a call to `.delete_endpoint()`. You need to show, in this notebook, that the endpoint was deleted. Any other resources you may delete from the AWS console, and you will find more instructions on cleaning up all your resources, below.\n",
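+    "\n",
+    "If you'd rather tidy up programmatically than through the console, the snippet below is a minimal, optional sketch using the `boto3` SageMaker client. It only lists what is left in your account, and the commented-out delete calls use placeholder names (not values from this notebook) that you would replace with your own resource names.\n",
+    "\n",
+    "```\n",
+    "import boto3\n",
+    "\n",
+    "sm_client = boto3.client('sagemaker')\n",
+    "\n",
+    "# list any endpoints, endpoint configurations, and models still in this account/region\n",
+    "for endpoint in sm_client.list_endpoints()['Endpoints']:\n",
+    "    print('endpoint:', endpoint['EndpointName'])\n",
+    "for config in sm_client.list_endpoint_configs()['EndpointConfigs']:\n",
+    "    print('endpoint config:', config['EndpointConfigName'])\n",
+    "for model in sm_client.list_models()['Models']:\n",
+    "    print('model:', model['ModelName'])\n",
+    "\n",
+    "# once you're sure a resource is no longer needed, delete it by name (placeholders shown)\n",
+    "# sm_client.delete_endpoint_config(EndpointConfigName='<your-endpoint-config-name>')\n",
+    "# sm_client.delete_model(ModelName='<your-model-name>')\n",
+    "```"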
+ ] + }, + { + "cell_type": "code", + "execution_count": 13, + "metadata": {}, + "outputs": [], + "source": [ + "# uncomment and fill in the line below!\n", + "predictor.delete_endpoint()\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Deleting S3 bucket\n", + "\n", + "When you are *completely* done with training and testing models, you can also delete your entire S3 bucket. If you do this before you are done training your model, you'll have to recreate your S3 bucket and upload your training data again." + ] + }, + { + "cell_type": "code", + "execution_count": 14, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "[{'ResponseMetadata': {'RequestId': 'FF46362CF1FCE014',\n", + " 'HostId': 'CpqWvNa0rhrelGsFOAYQFukvUQ8xvAhiB9EB24l472MTvPhr1ykHsAX/Gr8a+lSIjlL1Jfoz9fg=',\n", + " 'HTTPStatusCode': 200,\n", + " 'HTTPHeaders': {'x-amz-id-2': 'CpqWvNa0rhrelGsFOAYQFukvUQ8xvAhiB9EB24l472MTvPhr1ykHsAX/Gr8a+lSIjlL1Jfoz9fg=',\n", + " 'x-amz-request-id': 'FF46362CF1FCE014',\n", + " 'date': 'Tue, 07 Jan 2020 09:21:53 GMT',\n", + " 'connection': 'close',\n", + " 'content-type': 'application/xml',\n", + " 'transfer-encoding': 'chunked',\n", + " 'server': 'AmazonS3'},\n", + " 'RetryAttempts': 0},\n", + " 'Deleted': [{'Key': 'sagemaker-scikit-learn-2020-01-07-09-01-41-326/output/model.tar.gz'},\n", + " {'Key': 'sagemaker/plagiarism_detector/test.csv'},\n", + " {'Key': 'sagemaker/plagiarism_detector/train.csv'},\n", + " {'Key': 'sagemaker-scikit-learn-2020-01-07-09-09-49-538/debug-output/training_job_end.ts'},\n", + " {'Key': 'sagemaker-scikit-learn-2020-01-07-09-09-49-538/output/model.tar.gz'},\n", + " {'Key': 'sagemaker-scikit-learn-2020-01-07-09-09-49-538/source/sourcedir.tar.gz'},\n", + " {'Key': 'sagemaker-scikit-learn-2020-01-07-09-01-41-326/source/sourcedir.tar.gz'},\n", + " {'Key': 'sagemaker-scikit-learn-2020-01-07-09-01-41-326/debug-output/training_job_end.ts'}]}]" + ] + }, + "execution_count": 14, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "#deleting bucket, uncomment lines below\n", + "\n", + "bucket_to_delete = boto3.resource('s3').Bucket(bucket)\n", + "bucket_to_delete.objects.all().delete()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Deleting all your models and instances\n", + "\n", + "When you are _completely_ done with this project and do **not** ever want to revisit this notebook, you can choose to delete all of your SageMaker notebook instances and models by following [these instructions](https://docs.aws.amazon.com/sagemaker/latest/dg/ex1-cleanup.html). Before you delete this notebook instance, I recommend at least downloading a copy and saving it, locally." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "---\n", + "## Further Directions\n", + "\n", + "There are many ways to improve or add on to this project to expand your learning or make this more of a unique project for you. 
A few ideas are listed below:\n", + "* Train a classifier to predict the *category* (1-3) of plagiarism and not just plagiarized (1) or not (0).\n", + "* Utilize a different and larger dataset to see if this model can be extended to other types of plagiarism.\n", + "* Use language or character-level analysis to find different (and more) similarity features.\n", + "* Write a complete pipeline function that accepts a source text and submitted text file, and classifies the submitted text as plagiarized or not.\n", + "* Use API Gateway and a lambda function to deploy your model to a web application.\n", + "\n", + "These are all just options for extending your work. If you've completed all the exercises in this notebook, you've completed a real-world application, and can proceed to submit your project. Great job!" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "conda_pytorch_p36", + "language": "python", + "name": "conda_pytorch_p36" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.6.5" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/README.md b/README.md new file mode 100644 index 0000000..c7d406f --- /dev/null +++ b/README.md @@ -0,0 +1,33 @@ +# Plagiarism Project, Machine Learning Deployment + +This repository contains code and associated files for deploying a plagiarism detector using AWS SageMaker. + +## Project Overview + +In this project, you will be tasked with building a plagiarism detector that examines a text file and performs binary classification; labeling that file as either *plagiarized* or *not*, depending on how similar that text file is to a provided source text. Detecting plagiarism is an active area of research; the task is non-trivial and the differences between paraphrased answers and original work are often not so obvious. + +This project will be broken down into three main notebooks: + +**Notebook 1: Data Exploration** +* Load in the corpus of plagiarism text data. +* Explore the existing data features and the data distribution. +* This first notebook is **not** required in your final project submission. + +**Notebook 2: Feature Engineering** + +* Clean and pre-process the text data. +* Define features for comparing the similarity of an answer text and a source text, and extract similarity features. +* Select "good" features, by analyzing the correlations between different features. +* Create train/test `.csv` files that hold the relevant features and class labels for train/test data points. + +**Notebook 3: Train and Deploy Your Model in SageMaker** + +* Upload your train/test feature data to S3. +* Define a binary classification model and a training script. +* Train your model and deploy it using SageMaker. +* Evaluate your deployed classifier. + +--- + +Please see the [README](https://github.com/udacity/ML_SageMaker_Studies/tree/master/README.md) in the root directory for instructions on setting up a SageMaker notebook and downloading the project files (as well as the other notebooks). 
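+
+As a rough illustration of the kind of similarity feature described for Notebook 2 above, an n-gram *containment* score measures what fraction of the word n-grams in a submitted answer also appear in the corresponding source text. This sketch is for illustration only and is not part of the provided project code; the `containment` helper and the toy strings are made up for this example.
+
+```python
+from sklearn.feature_extraction.text import CountVectorizer
+
+def containment(answer_text, source_text, n=1):
+    """Fraction of the answer's word n-grams that also occur in the source text."""
+    vectorizer = CountVectorizer(analyzer='word', ngram_range=(n, n))
+    counts = vectorizer.fit_transform([answer_text, source_text]).toarray()
+    answer_counts, source_counts = counts[0], counts[1]
+    intersection = sum(min(a, s) for a, s in zip(answer_counts, source_counts))
+    return intersection / answer_counts.sum()
+
+# values near 1 suggest heavy copying; values near 0 suggest original writing
+print(containment("the cat sat on the mat", "the cat sat on a mat", n=1))
+```
+
+Larger n (bigrams, trigrams) is usually more telling, since long shared phrases are rarely produced independently.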
+ diff --git a/__MACOSX/._data b/__MACOSX/._data new file mode 100644 index 0000000..2b46a14 Binary files /dev/null and b/__MACOSX/._data differ diff --git a/__MACOSX/data/._.DS_Store b/__MACOSX/data/._.DS_Store new file mode 100644 index 0000000..09fa6bd Binary files /dev/null and b/__MACOSX/data/._.DS_Store differ diff --git a/__MACOSX/data/._file_information.csv b/__MACOSX/data/._file_information.csv new file mode 100644 index 0000000..bd2b377 Binary files /dev/null and b/__MACOSX/data/._file_information.csv differ diff --git a/__MACOSX/data/._g0pA_taska.txt b/__MACOSX/data/._g0pA_taska.txt new file mode 100644 index 0000000..2b46a14 Binary files /dev/null and b/__MACOSX/data/._g0pA_taska.txt differ diff --git a/__MACOSX/data/._g0pA_taskb.txt b/__MACOSX/data/._g0pA_taskb.txt new file mode 100644 index 0000000..2b46a14 Binary files /dev/null and b/__MACOSX/data/._g0pA_taskb.txt differ diff --git a/__MACOSX/data/._g0pA_taskc.txt b/__MACOSX/data/._g0pA_taskc.txt new file mode 100644 index 0000000..2b46a14 Binary files /dev/null and b/__MACOSX/data/._g0pA_taskc.txt differ diff --git a/__MACOSX/data/._g0pA_taskd.txt b/__MACOSX/data/._g0pA_taskd.txt new file mode 100644 index 0000000..2b46a14 Binary files /dev/null and b/__MACOSX/data/._g0pA_taskd.txt differ diff --git a/__MACOSX/data/._g0pA_taske.txt b/__MACOSX/data/._g0pA_taske.txt new file mode 100644 index 0000000..2b46a14 Binary files /dev/null and b/__MACOSX/data/._g0pA_taske.txt differ diff --git a/__MACOSX/data/._g0pB_taska.txt b/__MACOSX/data/._g0pB_taska.txt new file mode 100644 index 0000000..2b46a14 Binary files /dev/null and b/__MACOSX/data/._g0pB_taska.txt differ diff --git a/__MACOSX/data/._g0pB_taskb.txt b/__MACOSX/data/._g0pB_taskb.txt new file mode 100644 index 0000000..2b46a14 Binary files /dev/null and b/__MACOSX/data/._g0pB_taskb.txt differ diff --git a/__MACOSX/data/._g0pB_taskc.txt b/__MACOSX/data/._g0pB_taskc.txt new file mode 100644 index 0000000..2b46a14 Binary files /dev/null and b/__MACOSX/data/._g0pB_taskc.txt differ diff --git a/__MACOSX/data/._g0pB_taskd.txt b/__MACOSX/data/._g0pB_taskd.txt new file mode 100644 index 0000000..2b46a14 Binary files /dev/null and b/__MACOSX/data/._g0pB_taskd.txt differ diff --git a/__MACOSX/data/._g0pB_taske.txt b/__MACOSX/data/._g0pB_taske.txt new file mode 100644 index 0000000..2b46a14 Binary files /dev/null and b/__MACOSX/data/._g0pB_taske.txt differ diff --git a/__MACOSX/data/._g0pC_taska.txt b/__MACOSX/data/._g0pC_taska.txt new file mode 100644 index 0000000..2b46a14 Binary files /dev/null and b/__MACOSX/data/._g0pC_taska.txt differ diff --git a/__MACOSX/data/._g0pC_taskb.txt b/__MACOSX/data/._g0pC_taskb.txt new file mode 100644 index 0000000..2b46a14 Binary files /dev/null and b/__MACOSX/data/._g0pC_taskb.txt differ diff --git a/__MACOSX/data/._g0pC_taskc.txt b/__MACOSX/data/._g0pC_taskc.txt new file mode 100644 index 0000000..2b46a14 Binary files /dev/null and b/__MACOSX/data/._g0pC_taskc.txt differ diff --git a/__MACOSX/data/._g0pC_taskd.txt b/__MACOSX/data/._g0pC_taskd.txt new file mode 100644 index 0000000..2b46a14 Binary files /dev/null and b/__MACOSX/data/._g0pC_taskd.txt differ diff --git a/__MACOSX/data/._g0pC_taske.txt b/__MACOSX/data/._g0pC_taske.txt new file mode 100644 index 0000000..2b46a14 Binary files /dev/null and b/__MACOSX/data/._g0pC_taske.txt differ diff --git a/__MACOSX/data/._g0pD_taska.txt b/__MACOSX/data/._g0pD_taska.txt new file mode 100644 index 0000000..2b46a14 Binary files /dev/null and b/__MACOSX/data/._g0pD_taska.txt differ diff --git 
a/__MACOSX/data/._g0pD_taskb.txt b/__MACOSX/data/._g0pD_taskb.txt new file mode 100644 index 0000000..2b46a14 Binary files /dev/null and b/__MACOSX/data/._g0pD_taskb.txt differ diff --git a/__MACOSX/data/._g0pD_taskc.txt b/__MACOSX/data/._g0pD_taskc.txt new file mode 100644 index 0000000..2b46a14 Binary files /dev/null and b/__MACOSX/data/._g0pD_taskc.txt differ diff --git a/__MACOSX/data/._g0pD_taskd.txt b/__MACOSX/data/._g0pD_taskd.txt new file mode 100644 index 0000000..2b46a14 Binary files /dev/null and b/__MACOSX/data/._g0pD_taskd.txt differ diff --git a/__MACOSX/data/._g0pD_taske.txt b/__MACOSX/data/._g0pD_taske.txt new file mode 100644 index 0000000..2b46a14 Binary files /dev/null and b/__MACOSX/data/._g0pD_taske.txt differ diff --git a/__MACOSX/data/._g0pE_taska.txt b/__MACOSX/data/._g0pE_taska.txt new file mode 100644 index 0000000..2b46a14 Binary files /dev/null and b/__MACOSX/data/._g0pE_taska.txt differ diff --git a/__MACOSX/data/._g0pE_taskb.txt b/__MACOSX/data/._g0pE_taskb.txt new file mode 100644 index 0000000..2b46a14 Binary files /dev/null and b/__MACOSX/data/._g0pE_taskb.txt differ diff --git a/__MACOSX/data/._g0pE_taskc.txt b/__MACOSX/data/._g0pE_taskc.txt new file mode 100644 index 0000000..2b46a14 Binary files /dev/null and b/__MACOSX/data/._g0pE_taskc.txt differ diff --git a/__MACOSX/data/._g0pE_taskd.txt b/__MACOSX/data/._g0pE_taskd.txt new file mode 100644 index 0000000..2b46a14 Binary files /dev/null and b/__MACOSX/data/._g0pE_taskd.txt differ diff --git a/__MACOSX/data/._g0pE_taske.txt b/__MACOSX/data/._g0pE_taske.txt new file mode 100644 index 0000000..2b46a14 Binary files /dev/null and b/__MACOSX/data/._g0pE_taske.txt differ diff --git a/__MACOSX/data/._g1pA_taska.txt b/__MACOSX/data/._g1pA_taska.txt new file mode 100644 index 0000000..2b46a14 Binary files /dev/null and b/__MACOSX/data/._g1pA_taska.txt differ diff --git a/__MACOSX/data/._g1pA_taskb.txt b/__MACOSX/data/._g1pA_taskb.txt new file mode 100644 index 0000000..2b46a14 Binary files /dev/null and b/__MACOSX/data/._g1pA_taskb.txt differ diff --git a/__MACOSX/data/._g1pA_taskc.txt b/__MACOSX/data/._g1pA_taskc.txt new file mode 100644 index 0000000..2b46a14 Binary files /dev/null and b/__MACOSX/data/._g1pA_taskc.txt differ diff --git a/__MACOSX/data/._g1pA_taskd.txt b/__MACOSX/data/._g1pA_taskd.txt new file mode 100644 index 0000000..2b46a14 Binary files /dev/null and b/__MACOSX/data/._g1pA_taskd.txt differ diff --git a/__MACOSX/data/._g1pA_taske.txt b/__MACOSX/data/._g1pA_taske.txt new file mode 100644 index 0000000..2b46a14 Binary files /dev/null and b/__MACOSX/data/._g1pA_taske.txt differ diff --git a/__MACOSX/data/._g1pB_taska.txt b/__MACOSX/data/._g1pB_taska.txt new file mode 100644 index 0000000..2b46a14 Binary files /dev/null and b/__MACOSX/data/._g1pB_taska.txt differ diff --git a/__MACOSX/data/._g1pB_taskb.txt b/__MACOSX/data/._g1pB_taskb.txt new file mode 100644 index 0000000..2b46a14 Binary files /dev/null and b/__MACOSX/data/._g1pB_taskb.txt differ diff --git a/__MACOSX/data/._g1pB_taskc.txt b/__MACOSX/data/._g1pB_taskc.txt new file mode 100644 index 0000000..2b46a14 Binary files /dev/null and b/__MACOSX/data/._g1pB_taskc.txt differ diff --git a/__MACOSX/data/._g1pB_taskd.txt b/__MACOSX/data/._g1pB_taskd.txt new file mode 100644 index 0000000..2b46a14 Binary files /dev/null and b/__MACOSX/data/._g1pB_taskd.txt differ diff --git a/__MACOSX/data/._g1pB_taske.txt b/__MACOSX/data/._g1pB_taske.txt new file mode 100644 index 0000000..2b46a14 Binary files /dev/null and b/__MACOSX/data/._g1pB_taske.txt 
differ diff --git a/__MACOSX/data/._g1pD_taska.txt b/__MACOSX/data/._g1pD_taska.txt new file mode 100644 index 0000000..2b46a14 Binary files /dev/null and b/__MACOSX/data/._g1pD_taska.txt differ diff --git a/__MACOSX/data/._g1pD_taskb.txt b/__MACOSX/data/._g1pD_taskb.txt new file mode 100644 index 0000000..2b46a14 Binary files /dev/null and b/__MACOSX/data/._g1pD_taskb.txt differ diff --git a/__MACOSX/data/._g1pD_taskc.txt b/__MACOSX/data/._g1pD_taskc.txt new file mode 100644 index 0000000..2b46a14 Binary files /dev/null and b/__MACOSX/data/._g1pD_taskc.txt differ diff --git a/__MACOSX/data/._g1pD_taskd.txt b/__MACOSX/data/._g1pD_taskd.txt new file mode 100644 index 0000000..2b46a14 Binary files /dev/null and b/__MACOSX/data/._g1pD_taskd.txt differ diff --git a/__MACOSX/data/._g1pD_taske.txt b/__MACOSX/data/._g1pD_taske.txt new file mode 100644 index 0000000..2b46a14 Binary files /dev/null and b/__MACOSX/data/._g1pD_taske.txt differ diff --git a/__MACOSX/data/._g2pA_taska.txt b/__MACOSX/data/._g2pA_taska.txt new file mode 100644 index 0000000..2b46a14 Binary files /dev/null and b/__MACOSX/data/._g2pA_taska.txt differ diff --git a/__MACOSX/data/._g2pA_taskb.txt b/__MACOSX/data/._g2pA_taskb.txt new file mode 100644 index 0000000..2b46a14 Binary files /dev/null and b/__MACOSX/data/._g2pA_taskb.txt differ diff --git a/__MACOSX/data/._g2pA_taskc.txt b/__MACOSX/data/._g2pA_taskc.txt new file mode 100644 index 0000000..2b46a14 Binary files /dev/null and b/__MACOSX/data/._g2pA_taskc.txt differ diff --git a/__MACOSX/data/._g2pA_taskd.txt b/__MACOSX/data/._g2pA_taskd.txt new file mode 100644 index 0000000..2b46a14 Binary files /dev/null and b/__MACOSX/data/._g2pA_taskd.txt differ diff --git a/__MACOSX/data/._g2pA_taske.txt b/__MACOSX/data/._g2pA_taske.txt new file mode 100644 index 0000000..2b46a14 Binary files /dev/null and b/__MACOSX/data/._g2pA_taske.txt differ diff --git a/__MACOSX/data/._g2pB_taska.txt b/__MACOSX/data/._g2pB_taska.txt new file mode 100644 index 0000000..2b46a14 Binary files /dev/null and b/__MACOSX/data/._g2pB_taska.txt differ diff --git a/__MACOSX/data/._g2pB_taskb.txt b/__MACOSX/data/._g2pB_taskb.txt new file mode 100644 index 0000000..2b46a14 Binary files /dev/null and b/__MACOSX/data/._g2pB_taskb.txt differ diff --git a/__MACOSX/data/._g2pB_taskc.txt b/__MACOSX/data/._g2pB_taskc.txt new file mode 100644 index 0000000..2b46a14 Binary files /dev/null and b/__MACOSX/data/._g2pB_taskc.txt differ diff --git a/__MACOSX/data/._g2pB_taskd.txt b/__MACOSX/data/._g2pB_taskd.txt new file mode 100644 index 0000000..2b46a14 Binary files /dev/null and b/__MACOSX/data/._g2pB_taskd.txt differ diff --git a/__MACOSX/data/._g2pB_taske.txt b/__MACOSX/data/._g2pB_taske.txt new file mode 100644 index 0000000..2b46a14 Binary files /dev/null and b/__MACOSX/data/._g2pB_taske.txt differ diff --git a/__MACOSX/data/._g2pC_taska.txt b/__MACOSX/data/._g2pC_taska.txt new file mode 100644 index 0000000..2b46a14 Binary files /dev/null and b/__MACOSX/data/._g2pC_taska.txt differ diff --git a/__MACOSX/data/._g2pC_taskb.txt b/__MACOSX/data/._g2pC_taskb.txt new file mode 100644 index 0000000..2b46a14 Binary files /dev/null and b/__MACOSX/data/._g2pC_taskb.txt differ diff --git a/__MACOSX/data/._g2pC_taskc.txt b/__MACOSX/data/._g2pC_taskc.txt new file mode 100644 index 0000000..2b46a14 Binary files /dev/null and b/__MACOSX/data/._g2pC_taskc.txt differ diff --git a/__MACOSX/data/._g2pC_taskd.txt b/__MACOSX/data/._g2pC_taskd.txt new file mode 100644 index 0000000..2b46a14 Binary files /dev/null and 
b/__MACOSX/data/._g2pC_taskd.txt differ diff --git a/__MACOSX/data/._g2pC_taske.txt b/__MACOSX/data/._g2pC_taske.txt new file mode 100644 index 0000000..2b46a14 Binary files /dev/null and b/__MACOSX/data/._g2pC_taske.txt differ diff --git a/__MACOSX/data/._g2pE_taska.txt b/__MACOSX/data/._g2pE_taska.txt new file mode 100644 index 0000000..2b46a14 Binary files /dev/null and b/__MACOSX/data/._g2pE_taska.txt differ diff --git a/__MACOSX/data/._g2pE_taskb.txt b/__MACOSX/data/._g2pE_taskb.txt new file mode 100644 index 0000000..2b46a14 Binary files /dev/null and b/__MACOSX/data/._g2pE_taskb.txt differ diff --git a/__MACOSX/data/._g2pE_taskc.txt b/__MACOSX/data/._g2pE_taskc.txt new file mode 100644 index 0000000..2b46a14 Binary files /dev/null and b/__MACOSX/data/._g2pE_taskc.txt differ diff --git a/__MACOSX/data/._g2pE_taskd.txt b/__MACOSX/data/._g2pE_taskd.txt new file mode 100644 index 0000000..2b46a14 Binary files /dev/null and b/__MACOSX/data/._g2pE_taskd.txt differ diff --git a/__MACOSX/data/._g2pE_taske.txt b/__MACOSX/data/._g2pE_taske.txt new file mode 100644 index 0000000..2b46a14 Binary files /dev/null and b/__MACOSX/data/._g2pE_taske.txt differ diff --git a/__MACOSX/data/._g3pA_taska.txt b/__MACOSX/data/._g3pA_taska.txt new file mode 100644 index 0000000..2b46a14 Binary files /dev/null and b/__MACOSX/data/._g3pA_taska.txt differ diff --git a/__MACOSX/data/._g3pA_taskb.txt b/__MACOSX/data/._g3pA_taskb.txt new file mode 100644 index 0000000..2b46a14 Binary files /dev/null and b/__MACOSX/data/._g3pA_taskb.txt differ diff --git a/__MACOSX/data/._g3pA_taskc.txt b/__MACOSX/data/._g3pA_taskc.txt new file mode 100644 index 0000000..2b46a14 Binary files /dev/null and b/__MACOSX/data/._g3pA_taskc.txt differ diff --git a/__MACOSX/data/._g3pA_taskd.txt b/__MACOSX/data/._g3pA_taskd.txt new file mode 100644 index 0000000..2b46a14 Binary files /dev/null and b/__MACOSX/data/._g3pA_taskd.txt differ diff --git a/__MACOSX/data/._g3pA_taske.txt b/__MACOSX/data/._g3pA_taske.txt new file mode 100644 index 0000000..2b46a14 Binary files /dev/null and b/__MACOSX/data/._g3pA_taske.txt differ diff --git a/__MACOSX/data/._g3pB_taska.txt b/__MACOSX/data/._g3pB_taska.txt new file mode 100644 index 0000000..2b46a14 Binary files /dev/null and b/__MACOSX/data/._g3pB_taska.txt differ diff --git a/__MACOSX/data/._g3pB_taskb.txt b/__MACOSX/data/._g3pB_taskb.txt new file mode 100644 index 0000000..2b46a14 Binary files /dev/null and b/__MACOSX/data/._g3pB_taskb.txt differ diff --git a/__MACOSX/data/._g3pB_taskc.txt b/__MACOSX/data/._g3pB_taskc.txt new file mode 100644 index 0000000..2b46a14 Binary files /dev/null and b/__MACOSX/data/._g3pB_taskc.txt differ diff --git a/__MACOSX/data/._g3pB_taskd.txt b/__MACOSX/data/._g3pB_taskd.txt new file mode 100644 index 0000000..2b46a14 Binary files /dev/null and b/__MACOSX/data/._g3pB_taskd.txt differ diff --git a/__MACOSX/data/._g3pB_taske.txt b/__MACOSX/data/._g3pB_taske.txt new file mode 100644 index 0000000..2b46a14 Binary files /dev/null and b/__MACOSX/data/._g3pB_taske.txt differ diff --git a/__MACOSX/data/._g3pC_taska.txt b/__MACOSX/data/._g3pC_taska.txt new file mode 100644 index 0000000..2b46a14 Binary files /dev/null and b/__MACOSX/data/._g3pC_taska.txt differ diff --git a/__MACOSX/data/._g3pC_taskb.txt b/__MACOSX/data/._g3pC_taskb.txt new file mode 100644 index 0000000..2b46a14 Binary files /dev/null and b/__MACOSX/data/._g3pC_taskb.txt differ diff --git a/__MACOSX/data/._g3pC_taskc.txt b/__MACOSX/data/._g3pC_taskc.txt new file mode 100644 index 0000000..2b46a14 Binary 
files /dev/null and b/__MACOSX/data/._g3pC_taskc.txt differ diff --git a/__MACOSX/data/._g3pC_taskd.txt b/__MACOSX/data/._g3pC_taskd.txt new file mode 100644 index 0000000..2b46a14 Binary files /dev/null and b/__MACOSX/data/._g3pC_taskd.txt differ diff --git a/__MACOSX/data/._g3pC_taske.txt b/__MACOSX/data/._g3pC_taske.txt new file mode 100644 index 0000000..2b46a14 Binary files /dev/null and b/__MACOSX/data/._g3pC_taske.txt differ diff --git a/__MACOSX/data/._g4pB_taska.txt b/__MACOSX/data/._g4pB_taska.txt new file mode 100644 index 0000000..2b46a14 Binary files /dev/null and b/__MACOSX/data/._g4pB_taska.txt differ diff --git a/__MACOSX/data/._g4pB_taskb.txt b/__MACOSX/data/._g4pB_taskb.txt new file mode 100644 index 0000000..2b46a14 Binary files /dev/null and b/__MACOSX/data/._g4pB_taskb.txt differ diff --git a/__MACOSX/data/._g4pB_taskc.txt b/__MACOSX/data/._g4pB_taskc.txt new file mode 100644 index 0000000..2b46a14 Binary files /dev/null and b/__MACOSX/data/._g4pB_taskc.txt differ diff --git a/__MACOSX/data/._g4pB_taskd.txt b/__MACOSX/data/._g4pB_taskd.txt new file mode 100644 index 0000000..2b46a14 Binary files /dev/null and b/__MACOSX/data/._g4pB_taskd.txt differ diff --git a/__MACOSX/data/._g4pB_taske.txt b/__MACOSX/data/._g4pB_taske.txt new file mode 100644 index 0000000..2b46a14 Binary files /dev/null and b/__MACOSX/data/._g4pB_taske.txt differ diff --git a/__MACOSX/data/._g4pC_taska.txt b/__MACOSX/data/._g4pC_taska.txt new file mode 100644 index 0000000..2b46a14 Binary files /dev/null and b/__MACOSX/data/._g4pC_taska.txt differ diff --git a/__MACOSX/data/._g4pC_taskb.txt b/__MACOSX/data/._g4pC_taskb.txt new file mode 100644 index 0000000..2b46a14 Binary files /dev/null and b/__MACOSX/data/._g4pC_taskb.txt differ diff --git a/__MACOSX/data/._g4pC_taskc.txt b/__MACOSX/data/._g4pC_taskc.txt new file mode 100644 index 0000000..2b46a14 Binary files /dev/null and b/__MACOSX/data/._g4pC_taskc.txt differ diff --git a/__MACOSX/data/._g4pC_taskd.txt b/__MACOSX/data/._g4pC_taskd.txt new file mode 100644 index 0000000..2b46a14 Binary files /dev/null and b/__MACOSX/data/._g4pC_taskd.txt differ diff --git a/__MACOSX/data/._g4pC_taske.txt b/__MACOSX/data/._g4pC_taske.txt new file mode 100644 index 0000000..2b46a14 Binary files /dev/null and b/__MACOSX/data/._g4pC_taske.txt differ diff --git a/__MACOSX/data/._g4pD_taska.txt b/__MACOSX/data/._g4pD_taska.txt new file mode 100644 index 0000000..2b46a14 Binary files /dev/null and b/__MACOSX/data/._g4pD_taska.txt differ diff --git a/__MACOSX/data/._g4pD_taskb.txt b/__MACOSX/data/._g4pD_taskb.txt new file mode 100644 index 0000000..2b46a14 Binary files /dev/null and b/__MACOSX/data/._g4pD_taskb.txt differ diff --git a/__MACOSX/data/._g4pD_taskc.txt b/__MACOSX/data/._g4pD_taskc.txt new file mode 100644 index 0000000..2b46a14 Binary files /dev/null and b/__MACOSX/data/._g4pD_taskc.txt differ diff --git a/__MACOSX/data/._g4pD_taskd.txt b/__MACOSX/data/._g4pD_taskd.txt new file mode 100644 index 0000000..2b46a14 Binary files /dev/null and b/__MACOSX/data/._g4pD_taskd.txt differ diff --git a/__MACOSX/data/._g4pD_taske.txt b/__MACOSX/data/._g4pD_taske.txt new file mode 100644 index 0000000..2b46a14 Binary files /dev/null and b/__MACOSX/data/._g4pD_taske.txt differ diff --git a/__MACOSX/data/._g4pE_taska.txt b/__MACOSX/data/._g4pE_taska.txt new file mode 100644 index 0000000..2b46a14 Binary files /dev/null and b/__MACOSX/data/._g4pE_taska.txt differ diff --git a/__MACOSX/data/._g4pE_taskb.txt b/__MACOSX/data/._g4pE_taskb.txt new file mode 100644 index 
0000000..2b46a14 Binary files /dev/null and b/__MACOSX/data/._g4pE_taskb.txt differ diff --git a/__MACOSX/data/._g4pE_taskc.txt b/__MACOSX/data/._g4pE_taskc.txt new file mode 100644 index 0000000..2b46a14 Binary files /dev/null and b/__MACOSX/data/._g4pE_taskc.txt differ diff --git a/__MACOSX/data/._g4pE_taskd.txt b/__MACOSX/data/._g4pE_taskd.txt new file mode 100644 index 0000000..2b46a14 Binary files /dev/null and b/__MACOSX/data/._g4pE_taskd.txt differ diff --git a/__MACOSX/data/._g4pE_taske.txt b/__MACOSX/data/._g4pE_taske.txt new file mode 100644 index 0000000..2b46a14 Binary files /dev/null and b/__MACOSX/data/._g4pE_taske.txt differ diff --git a/__MACOSX/data/._orig_taska.txt b/__MACOSX/data/._orig_taska.txt new file mode 100644 index 0000000..409fb1c Binary files /dev/null and b/__MACOSX/data/._orig_taska.txt differ diff --git a/__MACOSX/data/._orig_taskc.txt b/__MACOSX/data/._orig_taskc.txt new file mode 100644 index 0000000..2b46a14 Binary files /dev/null and b/__MACOSX/data/._orig_taskc.txt differ diff --git a/__MACOSX/data/._orig_taskd.txt b/__MACOSX/data/._orig_taskd.txt new file mode 100644 index 0000000..2b46a14 Binary files /dev/null and b/__MACOSX/data/._orig_taskd.txt differ diff --git a/__MACOSX/data/._orig_taske.txt b/__MACOSX/data/._orig_taske.txt new file mode 100644 index 0000000..2b46a14 Binary files /dev/null and b/__MACOSX/data/._orig_taske.txt differ diff --git a/__MACOSX/data/._test_info.csv b/__MACOSX/data/._test_info.csv new file mode 100644 index 0000000..c9407dd Binary files /dev/null and b/__MACOSX/data/._test_info.csv differ diff --git a/__pycache__/helpers.cpython-36.pyc b/__pycache__/helpers.cpython-36.pyc new file mode 100644 index 0000000..03db8c2 Binary files /dev/null and b/__pycache__/helpers.cpython-36.pyc differ diff --git a/__pycache__/problem_unittests.cpython-36.pyc b/__pycache__/problem_unittests.cpython-36.pyc new file mode 100644 index 0000000..0045b88 Binary files /dev/null and b/__pycache__/problem_unittests.cpython-36.pyc differ diff --git a/data.zip b/data.zip new file mode 100644 index 0000000..2abe24e Binary files /dev/null and b/data.zip differ diff --git a/data/.DS_Store b/data/.DS_Store new file mode 100644 index 0000000..5008ddf Binary files /dev/null and b/data/.DS_Store differ diff --git a/data/file_information.csv b/data/file_information.csv new file mode 100644 index 0000000..e713b2f --- /dev/null +++ b/data/file_information.csv @@ -0,0 +1,101 @@ +File,Task,Category +g0pA_taska.txt,a,non +g0pA_taskb.txt,b,cut +g0pA_taskc.txt,c,light +g0pA_taskd.txt,d,heavy +g0pA_taske.txt,e,non +g0pB_taska.txt,a,non +g0pB_taskb.txt,b,non +g0pB_taskc.txt,c,cut +g0pB_taskd.txt,d,light +g0pB_taske.txt,e,heavy +g0pC_taska.txt,a,heavy +g0pC_taskb.txt,b,non +g0pC_taskc.txt,c,non +g0pC_taskd.txt,d,cut +g0pC_taske.txt,e,light +g0pD_taska.txt,a,cut +g0pD_taskb.txt,b,light +g0pD_taskc.txt,c,heavy +g0pD_taskd.txt,d,non +g0pD_taske.txt,e,non +g0pE_taska.txt,a,light +g0pE_taskb.txt,b,heavy +g0pE_taskc.txt,c,non +g0pE_taskd.txt,d,non +g0pE_taske.txt,e,cut +g1pA_taska.txt,a,non +g1pA_taskb.txt,b,heavy +g1pA_taskc.txt,c,light +g1pA_taskd.txt,d,cut +g1pA_taske.txt,e,non +g1pB_taska.txt,a,non +g1pB_taskb.txt,b,non +g1pB_taskc.txt,c,heavy +g1pB_taskd.txt,d,light +g1pB_taske.txt,e,cut +g1pD_taska.txt,a,light +g1pD_taskb.txt,b,cut +g1pD_taskc.txt,c,non +g1pD_taskd.txt,d,non +g1pD_taske.txt,e,heavy +g2pA_taska.txt,a,non +g2pA_taskb.txt,b,heavy +g2pA_taskc.txt,c,light +g2pA_taskd.txt,d,cut +g2pA_taske.txt,e,non +g2pB_taska.txt,a,non +g2pB_taskb.txt,b,non 
+g2pB_taskc.txt,c,heavy +g2pB_taskd.txt,d,light +g2pB_taske.txt,e,cut +g2pC_taska.txt,a,cut +g2pC_taskb.txt,b,non +g2pC_taskc.txt,c,non +g2pC_taskd.txt,d,heavy +g2pC_taske.txt,e,light +g2pE_taska.txt,a,heavy +g2pE_taskb.txt,b,light +g2pE_taskc.txt,c,cut +g2pE_taskd.txt,d,non +g2pE_taske.txt,e,non +g3pA_taska.txt,a,non +g3pA_taskb.txt,b,heavy +g3pA_taskc.txt,c,light +g3pA_taskd.txt,d,cut +g3pA_taske.txt,e,non +g3pB_taska.txt,a,non +g3pB_taskb.txt,b,non +g3pB_taskc.txt,c,heavy +g3pB_taskd.txt,d,light +g3pB_taske.txt,e,cut +g3pC_taska.txt,a,cut +g3pC_taskb.txt,b,non +g3pC_taskc.txt,c,non +g3pC_taskd.txt,d,heavy +g3pC_taske.txt,e,light +g4pB_taska.txt,a,non +g4pB_taskb.txt,b,non +g4pB_taskc.txt,c,heavy +g4pB_taskd.txt,d,light +g4pB_taske.txt,e,cut +g4pC_taska.txt,a,cut +g4pC_taskb.txt,b,non +g4pC_taskc.txt,c,non +g4pC_taskd.txt,d,heavy +g4pC_taske.txt,e,light +g4pD_taska.txt,a,light +g4pD_taskb.txt,b,cut +g4pD_taskc.txt,c,non +g4pD_taskd.txt,d,non +g4pD_taske.txt,e,heavy +g4pE_taska.txt,a,heavy +g4pE_taskb.txt,b,light +g4pE_taskc.txt,c,cut +g4pE_taskd.txt,d,non +g4pE_taske.txt,e,non +orig_taska.txt,a,orig +orig_taskb.txt,b,orig +orig_taskc.txt,c,orig +orig_taskd.txt,d,orig +orig_taske.txt,e,orig \ No newline at end of file diff --git a/data/g0pA_taska.txt b/data/g0pA_taska.txt new file mode 100755 index 0000000..9fcd2cd --- /dev/null +++ b/data/g0pA_taska.txt @@ -0,0 +1,22 @@ +Inheritance is a basic concept of Object-Oriented Programming where +the basic idea is to create new classes that add extra detail to +existing classes. This is done by allowing the new classes to reuse +the methods and variables of the existing classes and new methods and +classes are added to specialise the new class. Inheritance models the +“is-kind-of” relationship between entities (or objects), for example, +postgraduates and undergraduates are both kinds of student. This kind +of relationship can be visualised as a tree structure, where ‘student’ +would be the more general root node and both ‘postgraduate’ and +‘undergraduate’ would be more specialised extensions of the ‘student’ +node (or the child nodes). In this relationship ‘student’ would be +known as the superclass or parent class whereas, ‘postgraduate’ would +be known as the subclass or child class because the ‘postgraduate’ +class extends the ‘student’ class. + +Inheritance can occur on several layers, where if visualised would +display a larger tree structure. For example, we could further extend +the ‘postgraduate’ node by adding two extra extended classes to it +called, ‘MSc Student’ and ‘PhD Student’ as both these types of student +are kinds of postgraduate student. This would mean that both the ‘MSc +Student’ and ‘PhD Student’ classes would inherit methods and variables +from both the ‘postgraduate’ and ‘student classes’. diff --git a/data/g0pA_taskb.txt b/data/g0pA_taskb.txt new file mode 100755 index 0000000..971e3f3 --- /dev/null +++ b/data/g0pA_taskb.txt @@ -0,0 +1,5 @@ +PageRank is a link analysis algorithm used by the Google Internet search engine that assigns a numerical weighting to each element of a hyperlinked set of documents, such as the World Wide Web, with the purpose of "measuring" its relative importance within the set. Google assigns a numeric weighting from 0-10 for each webpage on the Internet; this PageRank? denotes a site’s importance in the eyes of Google. + +The PageRank? is derived from a theoretical probability value on a logarithmic scale like the Richter Scale. The PageRank? 
of a particular page is roughly based upon the quantity of inbound links as well as the PageRank? of the pages providing the links. The algorithm may be applied to any collection of entities with reciprocal quotations and references. The numerical weight that it assigns to any given element E is also called the PageRank? of E and denoted by PR(E). + +It is known that other factors, e.g. relevance of search words on the page and actual visits to the page reported by the Google toolbar also influence the PageRank?. Other link-based ranking algorithms for Web pages include the HITS algorithm invented by Jon Kleinberg (used by Teoma and now Ask.com), the IBM CLEVER project, and the TrustRank? algorithm. diff --git a/data/g0pA_taskc.txt b/data/g0pA_taskc.txt new file mode 100755 index 0000000..4b9b3fa --- /dev/null +++ b/data/g0pA_taskc.txt @@ -0,0 +1,7 @@ +The vector space model (also called, term vector model) is an algebraic model used to represent text documents, as well as any objects in general, as vectors of identifiers. It is used in information retrieval and was first used in the SMART Information Retrieval System. + +A document is represented as a vector and each dimension corresponds to a separate term. If a term appears in the document then its value in the vector is non-zero. Many different ways of calculating these values, also known as (term) weights, have been developed. One of the best known methods is called tf-idf weighting. + +The definition of term depends on the application but generally terms are single words, keywords, or longer phrases. If the words are chosen to be the terms, the dimensionality of the vector is the number of words in the vocabulary, which is the number of distinct words occurring in the corpus. + +The vector space model has several disadvantages. Firstly, long documents are represented badly because they have poor similarity values. Secondly, search keywords must accurately match document terms and substrings of words might result in a "false-positive match". Thirdly, documents with similar context but different term vocabulary will not be associated, resulting in a "false-negative match". Finally, the order in which the terms appear in the document is lost in the vector space representation. diff --git a/data/g0pA_taskd.txt b/data/g0pA_taskd.txt new file mode 100755 index 0000000..d17f798 --- /dev/null +++ b/data/g0pA_taskd.txt @@ -0,0 +1,21 @@ +Bayes’ theorem was names after Rev Thomas Bayes and is a method used +in probability theory. This theorem aims to relate the conditional and +marginal probabilities of two random events occuring, and given +various observations is frequently used to compute subsequent +probabilities. Bayes’ theorem is also often known as Bayes’ law. + +An example of where Bayes’ theorem may be used is in the following +extract: “Suppose there exists a school with forty percent females and +sixty percent males as students. The female students can only wear +skirts or trousers in equal numbers whereas all the male students can +only wear trousers. An observer randomly sees a student from a +distance and all he can see is that this student is wearing +trousers. What is the probability this student is female?” + +There is a debate amongst frequentists and Bayesians about how Bayes’ +theorem plays a major role around the beginnings of statistical +mathematics. Frequentist and Bayesian explanations do not agree about +the ways in which probabilities should be assigned. 
This is primarily +because Bayesians assign probabilities in terms of beliefs whereas +frequentists assign probabilities to random events according to the +frequencies of them occurring. diff --git a/data/g0pA_taske.txt b/data/g0pA_taske.txt new file mode 100755 index 0000000..52c8eb0 --- /dev/null +++ b/data/g0pA_taske.txt @@ -0,0 +1,15 @@ +Dynamic Programming is an algorithm design technique used for optimisation problems, such as minimising or maximising. Like divide and conquer, Dynamic Programming solves problems by combining solutions to sub-problems. However, unlike divide and conquer, sub-problems are not always independent as sub-problems may share sub-sub-problems but solution to one sub-problem may not affect the solutions to other sub-problems of the same problem. + +There are four steps in Dynamic Programming: + +1. Characterise structure of an optimal solution. + +2. Define value of optimal solution recursively. + +3. Compute optimal solution values either top-down with caching or bottom-up in a table. + +4. Construct an optimal solution from computed values. + +An example of the type of problem for which Dynamic Programming may be used is: given two sequences, X=(x1,...,xm) and Y=(y1,...,yn) find a common subsequence whose length is maximum. + +Dynamic Programming reduces computation by solving sub-problems in a bottom-up fashion and by storing solution to a sub-problem the first time it is solved. Also, looking up the solution when a sub-problem is encountered again helps reduce computation. However, the key in Dynamic Programming is to determine the structure of optimal solutions. diff --git a/data/g0pB_taska.txt b/data/g0pB_taska.txt new file mode 100755 index 0000000..aba8fc9 --- /dev/null +++ b/data/g0pB_taska.txt @@ -0,0 +1,33 @@ +Inheritance is a basic concept in object oriented programming. It models the reuse of existing class code in new classes – the “is a kind of” relationship. + +For example, a house is a kind of building; similarly, an office block is a kind of building. Both house and office block will inherit certain characteristics from buildings, but also have their own personal characteristics – a house may have a number of occupants, whereas an office block will have a number of offices. However, these personal characteristics don't apply to all types of buildings. + +In this example, the building would be considered the superclass – it contains general characteristics for other objects to inherit – and the house and office block are both subclasses – they are specific types and specialise the characteristics of the superclass. + +Java allows object inheritance. When one class inherits from another class, all the public variables and methods are available to the subclass. + +public class Shape { + + private Color colour; + + public void setColour(Color newColour){ + + colour = newColour; + + } + +} + +public class Circle extends Shape { + + private int radius; + + public void setRadius(int newRadius){ + + radius = newRadius; + + } + +} + +In this example, the Circle class is a subclass of the Shape class. The Shape class provides a public setColour method, which will be available to the Circle class and other subclasses of Shape. However, the private variable colour (as defined in the Shape class) will not be available for direct manipulation by the Circle class because it is not inherited. 
The Circle class specialises the Shape class, which means that setRadius is available to the Circle class and all subclasses of Circle, but it isn't available to the superclass Shape. diff --git a/data/g0pB_taskb.txt b/data/g0pB_taskb.txt new file mode 100755 index 0000000..5ca9489 --- /dev/null +++ b/data/g0pB_taskb.txt @@ -0,0 +1,26 @@ +PageRank (PR) refers to both the concept and the Google system used +for ranking the importance of pages on the web. The “PageRank” of a +site refers to its importance or value on the web in relation to the +rest of the sites that have been “PageRank”ed. + +The algorithm basically works like a popularity contest – if your site +is linked to by popular websites, then your site is considered more +popular. However, the PR doesn't just apply to the website as a whole +– different pages within a website get given different PRs dependent +on a number of factors: + +* Inbound links (backlinks) – how many pages (other than the ones on your website) link to this particular page + +* Outbound links (forward links) – how many external pages the particular page links to + +* Dangling links – how many pages with no external links are linked to from a particular page + +* Deep links – how many links that are not the home page are linked to from a particular page + +PR tries to emulate a “random surfer”. The algorithm includes a +dampening factor, which is the probability that a random surfer will +get bored and go and visit a new page - by default, this is 0.85. A +variation on this is the “intentional surfer”, where the importance of +a page is based on the actual visits to sites by users. This method is +used in the Google Toolbar, which reports back actual site visits to +Google. diff --git a/data/g0pB_taskc.txt b/data/g0pB_taskc.txt new file mode 100755 index 0000000..cf58195 --- /dev/null +++ b/data/g0pB_taskc.txt @@ -0,0 +1,14 @@ +Vector space model is an algebraic model for representing text documents (and in general, any objects) as vectors of identifiers, such as, for example, index terms. Its first use was in the SMART Information Retrieval System. It is used in information filtering, information retrieval, indexing and relevancy rankings. + +A document is represented as a vector, and each dimension corresponds to a separate term. If a term occurs in the document, its value in the vector is non-zero. Several different ways of computing these values, also known as (term) weights, have been developed. The definition of term depends on the application. Typically terms are single words, keywords, or longer phrases. If the words are chosen to be the terms, the dimensionality of the vector is the number of words in the vocabulary (the number of distinct words occurring in the corpus). + +One of the best known schemes is tf-idf weighting, proposed by Salton, Wong and Yang. In the classic vector space model, the term specific weights in the document vectors are products of local and global parameters. + +Relevancy rankings of documents in a keyword search can be calculated, using the assumptions of document similarities theory, by comparing the deviation of angles between each document vector and the original query vector where the query is represented as same kind of vector as the documents. 
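The answer above ranks documents by comparing the angle between each tf-idf document vector and the query vector. A minimal sketch of that idea, using only the Python standard library; the three-document toy corpus, the query, and the unsmoothed idf = log(N / document frequency) definition are illustrative assumptions, not taken from the dataset files.

# Sketch of tf-idf weighting and cosine-based relevancy ranking (toy corpus is hypothetical).
import math
from collections import Counter

docs = {
    "d1": "the vector space model represents documents as vectors",
    "d2": "pagerank ranks web documents by link structure",
    "d3": "bayes theorem relates conditional and marginal probabilities",
}
query = "vector space documents"

def tf_idf(text, idf):
    # Term weight = raw term frequency multiplied by inverse document frequency.
    counts = Counter(text.split())
    return {term: counts[term] * idf.get(term, 0.0) for term in counts}

def cosine(u, v):
    # Angle-based similarity: dot product divided by the product of the vector lengths.
    dot = sum(u[t] * v.get(t, 0.0) for t in u)
    norm = math.sqrt(sum(x * x for x in u.values())) * math.sqrt(sum(x * x for x in v.values()))
    return dot / norm if norm else 0.0

# idf = log(N / document frequency), computed over the toy corpus.
n_docs = len(docs)
df = Counter(term for text in docs.values() for term in set(text.split()))
idf = {term: math.log(n_docs / df[term]) for term in df}

doc_vectors = {name: tf_idf(text, idf) for name, text in docs.items()}
query_vector = tf_idf(query, idf)

# Rank documents by the cosine of the angle between each document vector and the query vector.
scores = {name: cosine(query_vector, vec) for name, vec in doc_vectors.items()}
for name in sorted(scores, key=scores.get, reverse=True):
    print(name, round(scores[name], 3))

Run as-is, d1 ranks first because it shares the most weighted terms with the query; d3 scores zero because it shares none.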
+ +The vector space model has the following limitations: + + * Search keywords must precisely match document terms; word substrings might result in a "false positive match"; + * Semantic sensitivity; documents with similar context but different term vocabulary won't be associated, resulting in a "false negative match"; + * The order in which the terms appear in the document is lost in the vector space representation; + * Long documents are poorly represented because they have poor similarity values (a small scalar product and a large dimensionality). diff --git a/data/g0pB_taskd.txt b/data/g0pB_taskd.txt new file mode 100755 index 0000000..350d117 --- /dev/null +++ b/data/g0pB_taskd.txt @@ -0,0 +1,26 @@ +Bayes' theorem relates the conditional and marginal probabilities of +two random events. For example, a person may be seen to have certain +medical symptoms; Bayes' theorem can then be used to compute the +probability that, given that observation, the proposed diagnosis is +the right one. + +Bayes' theorem forms a relationship between the probabilities xcof +events A and B. Intuitively, Bayes' theorem in this form describes the +way in which one's recognition of 'A' are updated by having observed +'B'. + +P(A | B) = P(B | A) P(A) / P(B) + +P(A|B) is the conditional probability of A given B. It is derived from or depends upon the specified value of B, therefore it is also known as the posterior probability. + +P(B|A) is the conditional probability of B given A. + +P(A) is the prior probability A. It doesn't take into account any information about B, so it is "prior". + +P(B) is the prior or marginal probability of B, and acts to normalise the probability. + +To derive the theorem, we begin with the definition of conditional +probability. By combining and re-arranging these two equations for A +and B, we get a the lemma called product rule for +probabilities. Provided that P(B) is not a zero, dividing both sides +by P(B) renders us with Bayes' theorem. diff --git a/data/g0pB_taske.txt b/data/g0pB_taske.txt new file mode 100755 index 0000000..f6affd2 --- /dev/null +++ b/data/g0pB_taske.txt @@ -0,0 +1,11 @@ +Dynamic programming is a method for solving mathematical programming problems that exhibit the properties of overlapping subproblems and optimal substructure. This is a much quicker method than other more naive methods. The word "programming" in "dynamic programming" relates optimization, which is commonly referred to as mathematical programming. Richard Bellman originally coined the term in the 1940s to describe a method for solving problems where one needs to find the best decisions one after another, and by 1953, he refined his method to the current modern meaning. + +Optimal substructure means that by splitting the programming into optimal solutions of subproblems, these can then be used to find the optimal solutions of the overall problem. One example is the computing of the shortest path to a goal from a vertex in a graph. First, compute the shortest path to the goal from all adjacent vertices. Then, using this, the best overall path can be found, thereby demonstrating the dynamic programming principle. This general three-step process can be used to solve a problem: + +1. Break up the problem different smaller subproblems. + +2. Recursively use this three-step process to compute the optimal path in the subproblem. + +3. Construct an optimal solution, using the computed optimal subproblems, for the original problem. 
+ +This process continues recursively, working over the subproblems by dividing them into sub-subproblems and so forth, until a simple case is reached (one that is easily solvable). diff --git a/data/g0pC_taska.txt b/data/g0pC_taska.txt new file mode 100755 index 0000000..2cd8f67 --- /dev/null +++ b/data/g0pC_taska.txt @@ -0,0 +1,7 @@ +inheritance in object oriented programming is where a new class is formed using classes which have allready been defined. These classes have have some of the behavior and attributes which where existent in the classes that it inherited from. The peropos of inheritance in object oriented programming is to minimize the reuse of existing code without modification. + +Inheritance allowes classes to be categorized, similer to the way humans catagorize. It also provides a way to generalize du to the "is a" relationship between classes. For example a "cow" is a generalization of "animal" similarly so are "pigs" & cheaters". Defeining classes in this way, allows us to define attributes and behaviours which are commen to all animals in one class, so cheaters would natuarly inheart properities commen to all animals. + +The advantage of inheritance is that classes which would otherwise have alot of similar code , can instead shair the same code, thus reducing the complexity of the program. Inheritance, therefore, can also be refered to as polymorphism which is where many pieces of code are controled by shared control code. + +Inheritance can be accomplished by overriding methods in its ancestor, or by adding new methods. diff --git a/data/g0pC_taskb.txt b/data/g0pC_taskb.txt new file mode 100755 index 0000000..15fa7cd --- /dev/null +++ b/data/g0pC_taskb.txt @@ -0,0 +1,5 @@ +There are many attributes which infulance the ranking of a page in google, The main too are the content, key words, and links. The content of a webpage generaly gives a good idea about what the page is about, however, there are some flaws in this, for example, for along time ibm web page didnt contain the word computer dispite it being strongly associated with them. To solve this problem, web pages can assign itself key words, which contribute to its ranking in searches. + +The second method is the use of links. the more sights which links to your web page and the higher the rank of those sights, the higher the rank of your site will be. This method is used as links are seen as an adoursment of a sight. + +With both these methods of ranking web pages, there are issues. key words can be compromised by sparming, google solves this problem by penolizing such activity. Useing links to rank a page also has its problems, for example, link farms which have recursive links, for the sole perpos of raising there ranking, google takels this by useing a dampaning algorthem. diff --git a/data/g0pC_taskc.txt b/data/g0pC_taskc.txt new file mode 100755 index 0000000..00b9b17 --- /dev/null +++ b/data/g0pC_taskc.txt @@ -0,0 +1,6 @@ +The vector space model is where each document is viewed as a bag of words, where there order has little significance. Each document is a vector where each word is a dimension. The vector is then constucted of the frequency of eacher word (dimension). The draw back to this approach is that the length of the document as an inpact on the vector, to compensate for this you can comput the cosine similarity between your two comparism documents. This will find the difference between the two vectors (the dot product), ignoreing the size of them. 
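A small check of the point just made, that cosine similarity compares the angle between two term-frequency vectors and therefore ignores their size: a document and the same document repeated three times have different raw count vectors but a similarity of 1.0. The sentence used is a made-up example, not a corpus file.

# Cosine similarity of raw word-count vectors is unaffected by document length (toy example).
import math
from collections import Counter

def cosine(u, v):
    dot = sum(u[t] * v.get(t, 0) for t in u)
    norm = math.sqrt(sum(x * x for x in u.values())) * math.sqrt(sum(x * x for x in v.values()))
    return dot / norm if norm else 0.0

short_doc = "dynamic programming stores solutions to subproblems"
long_doc = (short_doc + " ") * 3        # same content repeated, so three times as long

u = Counter(short_doc.split())
v = Counter(long_doc.split())

print(u)                                # every count is 1
print(v)                                # every count is 3 -- a "bigger" vector
print(cosine(u, v))                     # 1.0: the angle between the two vectors is zero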
+ +Inorder to query the search space, the query can also be represented as a vector, then you find the document whos vector has the greatest cosine similarities to your query. There are a number of wighting sceems which can be incoperated inorder to increase the accuracy of the vextors. + +There are some drawbacks with this approach, Computing the cosine similarities between each vector can be expensive as the number of dimensions can be in the thousands, To tackle this problem you can use inverted indexs and then a series heuristics inorder to inprove on this. +to top diff --git a/data/g0pC_taskd.txt b/data/g0pC_taskd.txt new file mode 100755 index 0000000..1a321c3 --- /dev/null +++ b/data/g0pC_taskd.txt @@ -0,0 +1,17 @@ +In probability theory; Bayes theorem (often called Bayes law after Rev +Thomas Bayes) relates the conditional and marginal probabilities of +two random events. It is used to compute posterior probabilities given +observations. For example; a person may be observed to have certain +symptoms. Bayes theorem can be used to compute the probability that a +proposed diagnosis is correct. + +As a formal theorem Bayes theorem is valid in all common +interpretations of probability. However, it plays a central role in +the debate around the foundations of statistics: frequentist and +Bayesian interpretations disagree about the ways in which +probabilities should be assigned to each other. Bayesians describe +probabilities in terms of beliefs and degrees of uncertainty, While +frequentists assign probabilities to random events according to their +frequencies of occurrence or to subsets of populations as proportions +of the whole. The articles on Bayesian probability and frequentist +probability discuss these debates in detail. diff --git a/data/g0pC_taske.txt b/data/g0pC_taske.txt new file mode 100755 index 0000000..610a70c --- /dev/null +++ b/data/g0pC_taske.txt @@ -0,0 +1,5 @@ +In computer science; dynamic programming is a way of solving problems consist of overlapping subproblems and optimal substructure. The method is more effiecent than naive methods. + +The term was first coined in the 1940s by Richard Bellman to describe the process of solving problems where you need to find the best decisions consecutavly. In 1953 he had refined this to the modern meaning. The field was founded as a systems analysis and engineering topic that is recognized by the IEEE. Bellman equation is a central result of dynamic programming which restates an optimization problem in recursive form. + +dynamic programming has little connection to computer programming at all, and instead comes from the term mathematical programming, a synonym for optimization. Thus, the program is the best plan for action that is produced. For instance, a events schedule at an exhibition is sometimes called a program. Programming means finding a plan of action. diff --git a/data/g0pD_taska.txt b/data/g0pD_taska.txt new file mode 100755 index 0000000..76ac667 --- /dev/null +++ b/data/g0pD_taska.txt @@ -0,0 +1,5 @@ +Inheritance in object oriented programming is a way to form new classes using classes that have already been defined. The new classes, known as derived classes, inherit attributes and behaviour of the existing classes, which are referred to as base classes. With little or no modification, it is intended to help reuse existing code. 
It is typically accomplished either by overriding one or more methods exposed by ancestor, or by adding new methods to those exposed by an ancestor + +Inheritance is also sometimes called generalization, because there is-a relationships represent a hierarchy between classes of objects. A ‘fruit’, for instance, is a generalization of "orange", "mango", "apples" and many others. One can consider fruit to be an abstraction of apple, orange, etc. Since apples are fruit (i.e., an apple is-a fruit), conversely apples may naturally inherit all the properties common to all fruit, such as being a fleshy container for the seed of a plant. + +An advantage of inheritance is that modules with sufficiently similar interfaces can share a lot of code reducing the complexity of the program. diff --git a/data/g0pD_taskb.txt b/data/g0pD_taskb.txt new file mode 100755 index 0000000..73c9ae0 --- /dev/null +++ b/data/g0pD_taskb.txt @@ -0,0 +1,3 @@ +PageRank algorithm is patented by Stanford University. It is a link analysis algorithm employed by the Google Internet search engine that assigns a value used to measure the importance to each element of a hyperlinked set of documents, such as the WWW, with the purpose of ” measuring" its relative significance within the set. + +Google owns exclusive license rights on the patent from Stanford University. The University received 1.8 million shares in Google in return for use of the patent. diff --git a/data/g0pD_taskc.txt b/data/g0pD_taskc.txt new file mode 100755 index 0000000..6db8b23 --- /dev/null +++ b/data/g0pD_taskc.txt @@ -0,0 +1,5 @@ +An algebraic model for representing text documents and any objects in general is known by the name Vector space model. It represents these as vectors of identifiers, index terms are one illustration of these. The Vector Space model was first used in the SMART Information Retrieval System, and it is utilised variously in indexing, information filtering, indexing and information retrieval. + +A document has representation as a vector. Every dimension is precisely related to a separate term. The way in which term is defined depends entirely on the application: typically ‘terms’ are either single words, keywords or longer phrases. The dimensionality of the vector is the number of words in the vocabulary, if it is the words that are chose to be the terms. So the same rule applies with keywords and indeed longer phrases. + +If a term occurs in the document, its value in the vector is non-zero. Several different ways of computing these values, additionally known as (term) weights, have been developed. One of the most famous schemes is tf-idf weighting. diff --git a/data/g0pD_taskd.txt b/data/g0pD_taskd.txt new file mode 100755 index 0000000..c054aa4 --- /dev/null +++ b/data/g0pD_taskd.txt @@ -0,0 +1 @@ +Baye’s theorm in connection with conditional probabilities is of fundamental importance, since it permits a calculation of PROB(AB) from PROB(BA). Statistical information that is often gathered in great volume can therefore be avoided diff --git a/data/g0pD_taske.txt b/data/g0pD_taske.txt new file mode 100755 index 0000000..40a7e23 --- /dev/null +++ b/data/g0pD_taske.txt @@ -0,0 +1,3 @@ +Dynamic programming (DP) is an extremely powerful, general tool for solving optimization difficulties on left-right-ordered item, for example character strings. It is similar to divide and conquer, however is differentiated as its subproblems are not independent. It is easily applicable, in relative terms, once understood. 
However until one has witnessed enough examples, it looks like magic. + +DP minimizes computation by solving subproblems from the base upwards, storing solution to a subproblem when it is initially conquered, and looking up the solution when the subproblem is experienced for a second time. diff --git a/data/g0pE_taska.txt b/data/g0pE_taska.txt new file mode 100755 index 0000000..f1d7243 --- /dev/null +++ b/data/g0pE_taska.txt @@ -0,0 +1 @@ +In object-oriented programming, inheritance is a way to form new classes (instances of which are called objects) using classes that have already been defined. The inheritance concept was invented in 1967 for Simula. The new classes, known as derived classes, take over (or inherit) attribute and behaviour of the pre-existing classes, which are referred to as base classes (or ancestor classes). It is intended to help reuse existing code with little or no modification. Inheritance provides the support for representation by categorization in computer languages. Categorization is a powerful mechanism number of information processing, crucial to human learning by means of generalization (what is known about specific entities is applied to a wider group given a belongs relation can be established) and cognitive economy (less information needs to be stored about each specific entity, only its particularities). Inheritance is also sometimes called generalization, because the is-a relationships represent a hierarchy between classes of objects. For instance, a "fruit" is a generalization of "apple", "orange", "mango" and many others. One can consider fruit to be an abstraction of apple, orange, etc. Conversely, since apples are fruit (i.e., an apple is-a fruit), apples may naturally inherit all the properties common to all fruit, such as being a fleshy container for the seed of a plant. An advantage of inheritance is that modules with sufficiently similar interfaces can share a lot of code, reducing the complexity of the program. Inheritance therefore has another view, a dual, called polymorphism, which describes many pieces of code being controlled by shared control code. Inheritance is typically accomplished either by overriding (replacing) one or more methods exposed by ancestor, or by adding new methods to those exposed by an ancestor. diff --git a/data/g0pE_taskb.txt b/data/g0pE_taskb.txt new file mode 100755 index 0000000..1a356f2 --- /dev/null +++ b/data/g0pE_taskb.txt @@ -0,0 +1 @@ +PageRank is a link analysis algorithm used by the Google Internet search engine that assigns a numerical weighting to each element of a hyperlinked set of documents, such as the World Wide Web, with the purpose of "measuring" its relative importance within the set. The algorithm may be applied to any collection of entities with reciprocal quotations and references. PageRank Uses in google toolbar: Measures popularity of a site ,Marketing value,Updated periodically, in google directory: PageRank: sort links within categories;Volunteers evaluate, classify, annotate;Open Directory project using PageRank. 
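The PageRank answers above describe a score that depends on the ranks of the pages linking in, each divided by their number of outbound links, together with a damping factor of around 0.85 for the "random surfer". A rough sketch of that iterative calculation on a made-up four-page link graph; the update PR(p) = (1 - d)/N + d * sum(PR(q)/outlinks(q)) is the simplified textbook form and ignores details such as dangling-link handling and the 0-10 scale reported in the toolbar.

# Simplified iterative PageRank on a hypothetical link graph (damping factor d = 0.85).
links = {                      # page -> pages it links to (its outbound links)
    "A": ["B", "C"],
    "B": ["C"],
    "C": ["A"],
    "D": ["C"],
}

d = 0.85
n = len(links)
rank = {page: 1.0 / n for page in links}   # start from an even distribution

for _ in range(50):                        # iterate until the values settle
    new_rank = {}
    for page in links:
        # Contribution from every page that links here: its rank split across its outbound links.
        incoming = sum(rank[q] / len(links[q]) for q in links if page in links[q])
        new_rank[page] = (1 - d) / n + d * incoming
    rank = new_rank

for page, value in sorted(rank.items(), key=lambda kv: kv[1], reverse=True):
    print(page, round(value, 3))

On this toy graph C ends up highest because three of the four pages link to it, which matches the "popularity contest" description above.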
diff --git a/data/g0pE_taskc.txt b/data/g0pE_taskc.txt new file mode 100755 index 0000000..47dccdd --- /dev/null +++ b/data/g0pE_taskc.txt @@ -0,0 +1 @@ +The representation of a set of documents as vectors in a common vector space is known as the vector space vector space model and is fundamental to a host of information retrieval (IR) operations including scoring documents on a query, document classification, and document clustering.We first develop the basic ideas underlying vector space scoring; a pivotal step in this development is the view of queries as vectors in the same vector space as the document collection. diff --git a/data/g0pE_taskd.txt b/data/g0pE_taskd.txt new file mode 100755 index 0000000..aa3285a --- /dev/null +++ b/data/g0pE_taskd.txt @@ -0,0 +1 @@ +Bayes Theorem is an important theorem relating conditional probabilities, it allows us to calculate PROB(A|B) from PROB(B|A). Bayes Theorem is important because it can save us from gathering vast amounts of statistical evidence. The main theory is PROB(A|B) = PROB(B|A) * PROB(A) /PROB(B), it means Using PROB(WIN|RAIN) from earlier, we can find the probability that it rained on a day that Harry won a race. diff --git a/data/g0pE_taske.txt b/data/g0pE_taske.txt new file mode 100755 index 0000000..65e735c --- /dev/null +++ b/data/g0pE_taske.txt @@ -0,0 +1 @@ +dynamic programming is a method of solving problems that exhibit the properties of overlapping subproblems and optimal substructure (described below). The method takes much less time than naive methods. The word "programming" in "dynamic programming" has no particular connection to computer programming at all, and instead comes from the term "mathematical programming", a synonym for optimization. Thus, the "program" is the optimal plan for action that is produced. For instance, a finalized schedule of events at an exhibition is sometimes called a program. Programming, in this sense, means finding an acceptable plan of action, an algorithm. diff --git a/data/g1pA_taska.txt b/data/g1pA_taska.txt new file mode 100755 index 0000000..926811d --- /dev/null +++ b/data/g1pA_taska.txt @@ -0,0 +1 @@ +In object oriented programming, objects are grouped together into classes according to their type, structure and the functions that can be performed on them. Inheritance is a process in object oriented programming in which objects acquire (or inherit) the properties of objects of another class. It is therefore used to create relationships between one object and another. Each class groups together objects of a similar type, with similar properties. New classes can be formed by this process whose objects will have properties of both the classes from which this new class is formed. A superclass has all of the properties of the subclasses below it. At the same time subclasses are each distinctive from each other but related via the superclass. Subclasses are said to ‘extend’ superclasses. Due to these relationships, object oriented programmes tend to be easier to modify since they do not need to be changed when a new object, with different properties is added. Instead, a new object is made to inherit properties of objects which already exist. Inheritance can be divided into two main processes: single inheritance and multiple inheritance. Single inheritance means that the class can only inherit from one other class, whereas multiple inheritance allows for inheritance from several classes. 
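The answer above describes subclasses extending a superclass, inheriting its attributes and behaviour, with single inheritance allowing only one parent class. A minimal Python sketch of that relationship; the Transport and Car names and the move method are illustrative (they echo the transport example used in another answer in this set), not part of any corpus file.

# Single inheritance: Car reuses Transport's code and adds its own behaviour (illustrative names).
class Transport:
    def __init__(self, speed, position):
        self.speed = speed              # shared attributes defined once in the superclass
        self.position = position

    def move(self, minutes):
        # Behaviour written once here is inherited by every subclass.
        self.position += self.speed * minutes / 60.0
        return self.position

class Car(Transport):                   # Car "is a kind of" Transport
    def __init__(self, speed, position, passengers):
        super().__init__(speed, position)
        self.passengers = passengers    # subclass-specific attribute

car = Car(speed=60, position=0.0, passengers=4)
print(car.move(30))                     # 30.0 -- inherited method, no code duplicated in Car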
diff --git a/data/g1pA_taskb.txt b/data/g1pA_taskb.txt new file mode 100755 index 0000000..ea8ebaf --- /dev/null +++ b/data/g1pA_taskb.txt @@ -0,0 +1,3 @@ +The PageRank algorithm is used to designate every aspect of a set of hyperlinked documents with a numerical weighting. It is used by the Google search engine to estimate the relative importance of a web page according to this weighting. The system uses probability distribution to determine the odds that a person randomly clicking on links will arrive at any given page. Following this, each web page is given a ranking of 0-10 according to its relevance to a search. The PageRank is calculated by taking into consideration the number of inbound links, and the PageRank of the pages supplying these links. This means therefore that if a webpage is linked to others that have a high ranking, then it too will receive a high rank. + +Due to the nature of the PageRank system, it is susceptible to manipulation and has been exploited so that certain pages are given a false, exaggerated ranking. In these cases, only Goggle has access to the genuine PageRank. However, much research has been conducted into methods of avoiding links from documents with a false PageRank to try and iron out the bugs in this system and from 2007 Google has actively penalized schemes which try to increase rankings artificially. diff --git a/data/g1pA_taskc.txt b/data/g1pA_taskc.txt new file mode 100755 index 0000000..c2418cb --- /dev/null +++ b/data/g1pA_taskc.txt @@ -0,0 +1,6 @@ +The vector space model is an algebraic model used to represent text documents (and any objects, generally) as vectors of identifiers, for instance index terms. Its applications include information filtering, information retrieval, indexing and relevancy rankings. With reference to this model, documents are represented as vectors. Each dimension corresponds to a separate term. The value of a vector is non-zero if a term occurs in the document. Several different ways have been developed of calculating these values (also known as term weights). One of the best known schemes is tf-idf (term frequency-inverse document frequency) weighting. + +The model can be used to determine the relevancy rankings of documents in a keyword search, using the assumptions of document similarities theory, by comparing the original query vector (where the query is represented as same kind of vector as the documents) and the deviation of angles between each document vector. + +The classic vector space model was put forward by Salton, Wong and Yang and is known as term frequency-inverse document frequency model. In this classic model the term specific weights in the document vectors are products of local and global parameters. In a simpler Term Count Model the term specific weights are just the counts of term occurrences and therefore do not include the global parameter. + diff --git a/data/g1pA_taskd.txt b/data/g1pA_taskd.txt new file mode 100755 index 0000000..40787d5 --- /dev/null +++ b/data/g1pA_taskd.txt @@ -0,0 +1,3 @@ +Bayes' theorem relates the conditional and marginal probabilities of two random events and is named after the Reverend Thomas Bayes (1702–1761), who studied how to compute a distribution for the parameter of a binomial distribution. It is valid in all common interpretations of probability. It plays a central role in the debate around the foundations of statistics: frequentist and Bayesian interpretations disagree about the ways in which probabilities should be assigned in applications. 
Frequentists assign probabilities to random events according to their frequencies of occurrence or to subsets of populations as proportions of the whole, while Bayesians describe probabilities in terms of beliefs and degrees of uncertainty. Applications of Bayes' theorem often assume the philosophy underlying Bayesian probability that uncertainty and degrees of belief can be measured as probabilities. One of Bayes' results (Proposition 5) gives a simple description of conditional probability, and shows that it can be expressed independently of the order in which things occur: +If there be two subsequent events, the probability of the second b/N and the probability of both together P/N, and it being first discovered that the second event has also happened, from hence I guess that the first event has also happened, the probability I am right [i.e., the conditional probability of the first event being true given that the second has also happened] is P/b. +Note that the expression says nothing about the order in which the events occurred; it measures correlation, not causation. diff --git a/data/g1pA_taske.txt b/data/g1pA_taske.txt new file mode 100755 index 0000000..ab73aeb --- /dev/null +++ b/data/g1pA_taske.txt @@ -0,0 +1,4 @@ +Dynamic programming is an algorithmic technique used to solve certain optimization problems where the object is to find the best solution from a number of possibilities. It uses a so called ‘bottom-up’ approach, meaning that the problem is solved as a set of sub-problems which in turn are made up of sub-sub-problems.Sub-problems are then selected and used to solve the overall problem. These sub-problems are only solved once and the solutions are saved so that they will not need to be recalculated again. Whilst calculated individually, they may also overlap. When any sub-problem is met again, it can be found and re-used to solve another problem. Since it searches all possibilities, it is also very accurate. This method is far more efficient than recalculating and therefore considerably reduces computation. It is widely used in computer science and can be applied for example, to compress data in high density bar codes. + +Dynamic programming is most effective and therefore most often used on objects that are ordered from left to right and whose order cannot be rearranged. This means it works well on character chains for example. + diff --git a/data/g1pB_taska.txt b/data/g1pB_taska.txt new file mode 100755 index 0000000..a5befdc --- /dev/null +++ b/data/g1pB_taska.txt @@ -0,0 +1,2 @@ +Inheritance is one of the basic concepts of Object Oriented Programming. Its objective is to add more detail to pre-existing classes whilst still allowing the methods and variables of these classes to be reused. The easiest way to look at inheritance is as an is a kind of relationship. For example, a guitar is a kind of string instrument, electric, acoustic and steel stringed are all types of guitar. +The further down an inheritance tree you get, the more specific the classes become. An example here would be books. Books generally fall into two categories, fiction and non-fiction. Each of these can then be sub-divided into more groups. Fiction for example can be split into fantasy, horror, romance and many more. Non-fiction splits the same way into other topics such as history, geography, cooking etc. History of course can be sub-divided into time periods like the Romans, the Elizabethans, the World Wars and so on. 
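The dynamic-programming answers above describe solving sub-problems bottom-up and storing each solution so it is computed only once, and one of the earlier answers poses the longest-common-subsequence problem for two sequences. A short bottom-up sketch of that table; the input strings are arbitrary examples chosen here, not taken from the dataset.

# Bottom-up dynamic programming: length of the longest common subsequence of two strings.
def lcs_length(x, y):
    m, n = len(x), len(y)
    # table[i][j] = LCS length of x[:i] and y[:j]; each sub-problem is solved exactly once.
    table = [[0] * (n + 1) for _ in range(m + 1)]
    for i in range(1, m + 1):
        for j in range(1, n + 1):
            if x[i - 1] == y[j - 1]:
                table[i][j] = table[i - 1][j - 1] + 1                # extend the common subsequence
            else:
                table[i][j] = max(table[i - 1][j], table[i][j - 1])  # reuse stored sub-solutions
    return table[m][n]

print(lcs_length("ABCBDAB", "BDCABA"))   # 4, e.g. "BCBA"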
diff --git a/data/g1pB_taskb.txt b/data/g1pB_taskb.txt new file mode 100755 index 0000000..84dfd71 --- /dev/null +++ b/data/g1pB_taskb.txt @@ -0,0 +1 @@ +A websites page rank, is how important it is on the web. It is essentially a popularity meter. Popularity or importance is determined by the amount of links relating to the page there are, there are four different types. Inbound, links from other pages to yours. Outbound, links from your page to others. Dangling, links to a page which has no links to others. Deep, links to a specific page, usually bypassing the homepage. The page rank algorithm takes the probability of a random surfer becoming bored and requesting another random page (otherwise known as the dampening factor) away from 1 and divides this number by the number of pages in the system, adding it to the dampening factor multiplied by the page rank of a linked page divided by the number of outbound links on that linked page. Adding on this last section for every other page linked to from the original page. Google uses this algorithm to assist intentional surfers in finding the best websites to suit their needs. One of the problems with this popularity algorithm is that it is easily manipulated and can give false values, hence the frequent recalculating of page ranks. diff --git a/data/g1pB_taskc.txt b/data/g1pB_taskc.txt new file mode 100755 index 0000000..5886b64 --- /dev/null +++ b/data/g1pB_taskc.txt @@ -0,0 +1,4 @@ +The algebraic model for representing text documents and objects as vectors of identifiers is called the vector space model. It is used in information filtering, indexing, relevancy rankings and information retrieval. It was first used in the SMART Information Retrieval System. + When a document is represented as a vector, each dimension corresponds to a separate term. A term which occurs in the document has a value in the vector of non-zero. Other ways of computing these values, or weights, have been developed. The most popular is tf-idf weighting. +Depending on the application, the definition of term varies. Single words, keywords and occasionally longer phrases are used for terms. The dimensionality of the vector, if words are used as terms, is the total number of words available for use. By using the assumptions of the document similarities theory, the relevancy rankings of documents in a keyword search can be worked out by comparing the deviation of angles between vectors both within the document and the original query where the vectors of both are the same type. +The limitations of the vector space model are thus. Due to poor similarity values long documents are poorly represented. False positive matches may be returned if search keywords do not precisely match document terms. False negative matches could be returned when documents share a context but have different term vocabulary. Vector space representation results in the loss of the order which the terms are in the document. diff --git a/data/g1pB_taskd.txt b/data/g1pB_taskd.txt new file mode 100755 index 0000000..17ecb8b --- /dev/null +++ b/data/g1pB_taskd.txt @@ -0,0 +1,3 @@ +Bayes theorem relates the conditional and marginal probabilities of two random events. It is mainly used to calculate the probability of one events outcome given that a previous event happened. For example, the probability that a doctors diagnosis is correct given that the doctor had previously observed symptoms in the patient. 
Bayes theorem can be used for all forms of probability, however it is currently at the centre of a debate concerning the ways in which probabilities should be assigned in applications. +The theorem states that the probability of Event A happening given Event B is the probability of B given A multiplied by the probability of A regardless of B all divided by the probability of B regardless of A which acts as a normalising constant. Bayes theorem formed in this way basically details how ones beliefs about Event A are renewed or updated knowing that Event B happened. When calculating conditional probabilities such as these, it is often useful to create a table containing the number of occurrences, or relative frequencies, of each outcome for each of the variables independently. + diff --git a/data/g1pB_taske.txt b/data/g1pB_taske.txt new file mode 100755 index 0000000..05e9969 --- /dev/null +++ b/data/g1pB_taske.txt @@ -0,0 +1,4 @@ +In mathematics and computer science, dynamic programming is a method of solving problems that exhibit the properties of overlapping sub problems and optimal substructure. The term was originally used in the 1940s by Richard Bellman to describe the process of solving problems where one needs to find the best decisions one after another. By 1953, he had refined this to the modern meaning. Bellman's contribution is remembered in the name of the Bellman equation, a central result of dynamic programming which restates an optimization problem in recursive form. The word "programming" in "dynamic programming" has no particular connection to computer programming at all, and instead comes from the term "mathematical programming", a synonym for optimization. Thus, the "program" is the optimal plan for action that is produced. For instance, a finalized schedule of events at an exhibition is sometimes called a program. Programming, in this sense, means finding an acceptable plan of action, an algorithm. +Dynamic programming usually takes one of two approaches, the top-down approach, the problem is broken into sub problems, and these sub problems are solved and the solutions remembered, in case they need to be solved again. This is recursion and memorization combined together and the bottom-up approach, all sub problems that might be needed are solved in advance and then used to build up solutions to larger problems. This approach is slightly better in stack space and number of function calls, but it is sometimes not intuitive to figure out all the sub problems needed for solving the given problem. +Some programming languages can automatically memorize the result of a function call with a particular set of arguments, in order to speed up call-by-name. Some languages make it possible portably (e.g. Scheme, Common Lisp or Perl), some need special extensions.This is only possible for a referentially transparent function. + diff --git a/data/g1pD_taska.txt b/data/g1pD_taska.txt new file mode 100755 index 0000000..d87f9a5 --- /dev/null +++ b/data/g1pD_taska.txt @@ -0,0 +1 @@ +Inheritance is a method of forming new classes using predefined classes. The new classes are called derived classes and they inherit the behaviours and attributes of the base classes. It was intended to allow existing code to be used again with minimal or no alteration. It also offers support for representation by categorization in computer languages; this is a powerful mechanism of information processing, vital to human learning by means of generalization and cognitive economy. 
Inheritance is occasionally referred to as generalization due to the fact that is-a relationships represent a hierarchy between classes of objects. Inheritance has the advantage of reducing the complexity of a program since modules with very similar interfaces can share lots of code. Due to this, inheritance has another view called polymorphism, where many sections of code are being controlled by some shared control code. Inheritance is normally achieved by overriding one or more methods exposed by ancestor, or by creating new methods on top of those exposed by an ancestor. Inheritance has a variety of uses. Each different use focuses on different properties, for example the external behaviour of objects, internal structure of an object, inheritance hierarchy structure, or software engineering properties of inheritance. Occasionally it is advantageous to differentiate between these uses, as it is not necessarily noticeable from context. diff --git a/data/g1pD_taskb.txt b/data/g1pD_taskb.txt new file mode 100755 index 0000000..4dd69f8 --- /dev/null +++ b/data/g1pD_taskb.txt @@ -0,0 +1 @@ +PageRank is a link analysis algorithm used by the Google Internet search engine that assigns a numerical weighting to each element of a hyperlinked set of documents, such as the World Wide Web, with the purpose of "measuring" its relative importance within the set. Google assigns a numeric weighting from 0-10 for each webpage on the Internet; this PageRank denotes a site’s importance in the eyes of Google. The PageRank is derived from a theoretical probability value on a logarithmic scale like the Richter Scale. PageRank is a probability distribution used to represent the likelihood that a person randomly clicking on links will arrive at any particular page. PageRank can be calculated for collections of documents of any size. It is assumed in several research papers that the distribution is evenly divided between all documents in the collection at the beginning of the computational process. The PageRank computations require several passes, called "iterations", through the collection to adjust approximate PageRank values to more closely reflect the theoretical true value. A probability is expressed as a numeric value between 0 and 1. A 0.5 probability is commonly expressed as a "50% chance" of something happening. Hence, a PageRank of 0.5 means there is a 50% chance that a person clicking on a random link will be directed to the document with the 0.5 PageRank. The PageRank theory holds that even an imaginary surfer who is randomly clicking on links will eventually stop clicking. The probability, at any step, that the person will continue is a damping factor d. Various studies have tested different damping factors, but it is generally assumed that the damping factor will be set around 0.85. diff --git a/data/g1pD_taskc.txt b/data/g1pD_taskc.txt new file mode 100755 index 0000000..76261e8 --- /dev/null +++ b/data/g1pD_taskc.txt @@ -0,0 +1 @@ +Within Information Retrieval each document in a set can be represented as a point in high-dimensional vector space, this representation is called the vector space model. Information Retrieval queries are also represented as vectors in the same vector space; these are then used in conjunction with the document vectors to find relevant documents. The two vectors are compared and the documents with a higher document-query similarity are ranked higher in terms of relevance. 
There are a variety of techniques that can be used to compare the two vectors; the most frequently used method for the vector space model is the Cosine Coefficient, which calculates the angle between the two vectors and produces a value between 0 and 1. diff --git a/data/g1pD_taskd.txt b/data/g1pD_taskd.txt new file mode 100755 index 0000000..a9711c7 --- /dev/null +++ b/data/g1pD_taskd.txt @@ -0,0 +1 @@ +Bayes Theorem is a mathematical formula used to calculate conditional probabilities. Given the probability of event A given event B, Bayes Theorem can be used to calculate the probability of B given A. This is achieved using the conditional probability of B given A and the prior probabilities of both events A and B. For example: suppose there is a bag of coloured balls with 25 red ones and 75 black ones. Lucky Joe likes to predict the colour of the ball he selects and he is 80% accurate. Joe records all of his results and about 0.5% of the time he accidently records the wrong results. Using all of this information more probabilities can be inferred, including using Bayes Theorem to calculate various probabilities like Joe recording correctly if he guesses correctly or Joe recording incorrectly when his guess was correct (and other like combinations). diff --git a/data/g1pD_taske.txt b/data/g1pD_taske.txt new file mode 100755 index 0000000..7729d90 --- /dev/null +++ b/data/g1pD_taske.txt @@ -0,0 +1,5 @@ +Dynamic programming is a faster method of solving problems that make use of optimal substructure, overlapping sub-problems and memoization. It has no relationship to computer programming; instead it is a process of finding a satisfactory algorithm. + +Optimal substructure is the process of using the optional solutions to sub problems to find the optimal solution to the overall problem. When the same sub problem solutions can be used to solve various bigger problems it is said to have overlapping-sub problems. Memoization is used in order to save time the solutions are stored rather than be recomputed. A solution can be disposed of once we are positive that it will no longer be required, in some cases a solution to a future problem can be computed in advance. + +There are two main approaches for dynamic programming. The first is the bottom up approach. Although it is not always simple to find all of them, any required sub problems are solved in advance and then used to create solutions to larger problems. The other method is the top down approach which is a method that combines memorization and recursion. The main problem is divided into sub problems which are solved and stored for future use. diff --git a/data/g2pA_taska.txt b/data/g2pA_taska.txt new file mode 100755 index 0000000..376a47d --- /dev/null +++ b/data/g2pA_taska.txt @@ -0,0 +1,10 @@ +Inheritance allows programs developed in an Object Orientated language to reuse code without having it replicated unnecessarily elsewhere within the program. + +To achieve this, the programmer has to note generalisations and similarities about various aspects of the program. + +For example, a program could exist to model different forms of transport. At first glance, a car and a train may not have much in common. But abstractly, both will have a speed at which they are travelling, a direction, and a current position. +Methods utilising this data can be specified high up in the inheritance hierarchy, for example in a Transport class. 
For example you could have a method which works out the new position of a train after travelling x minutes in direction y. Likewise, you might want to be able to find out the same information for an object of the type car. +Inheritance means that if such a method was defined in the superclass of the train and car classes, any car or train object can utilise it. + +The train and car subclasses are said to extend the Transport class, as they will have additional characteristics which they dont share. E.g. passenger capacity would be a class variable of both car and train (but have different values), and a train may have methods along the lines of is toilet engaged. +If you then wanted to add additional forms of transport, such as an aeroplane, you may wish for that also to have a toilet engaged function. Then you could have an extended hierarchy, where a Mass Transport class extends the Transport class. Under which youd have a train and aeroplane, which would inherit characteristics from both super classes. \ No newline at end of file diff --git a/data/g2pA_taskb.txt b/data/g2pA_taskb.txt new file mode 100755 index 0000000..69e3728 --- /dev/null +++ b/data/g2pA_taskb.txt @@ -0,0 +1,5 @@ +The algorithm that Google uses to assign a weighting to each element of a linked set of documents, with the purpose of "measuring" its relative importance within the set. +A particular websites PageRank results from a "vote" from other pages on the Internet about how important that website actually is. A link to a page is seen as a vote of support. The PageRank depends on the PageRank rating and number of all pages that have links to it. Additionally, if a page is linked to by pages with a high PageRank rating, this increases the rating of the original page. +The PageRank scale ranges from 0-10. The rating of a certain page is generally based upon the quantity of inbound links as well as the perceived quality of the pages providing the links. +PageRank could be described as a probability distribution representing the chance that someone randomly clicking on links will reach a certain page. The PageRank calculations require iterations through the collection of web pages to alter approximate PageRank values to accurately reflect the actual rank. +In order to prevent spamming, Google releases little information on the way in which a PageRank is calculated. The PageRank algorithm has led to many sites being spammed with links in an attempt to artificially inflate the PageRank of the linked page, notably in blog comments and message boards. In 2005 a nofollow tag was added as an attribute of a HTML link to be used where Google shouldnt change the PageRank of the linked page as a result of the link. \ No newline at end of file diff --git a/data/g2pA_taskc.txt b/data/g2pA_taskc.txt new file mode 100755 index 0000000..deee41d --- /dev/null +++ b/data/g2pA_taskc.txt @@ -0,0 +1,8 @@ +A Vector space model (or term vector model) is an algebraic way of representing text documents (and any objects, in general) as vectors of identifiers, such as index terms. It is used in information filtering, information retrieval, indexing and relevancy rankings. Its first application was in the SMART Information Retrieval System. +A document can be represented as a vector. Every dimension relates to a different term. If a term appears in the document, the terms value in the vector is non-zero. Many different methods of calculating these values, sometimes known as (term) weights, have been developed. 
tf-idf weighting is one of the most well known schemes. (see below example). +The definition of a term depends on the application. Normally a term is a single word, keyword, or a longer phrase. If the words are chosen to be the terms, the dimensionality of the vector is the number of words in the vocabulary (the number of distinct words occurring in the corpus). +The vector space model has some limitations: +1. Longer documents are represented poorly because the documents have poor similarity values (namely a small scalar product and a large dimensionality) +2. Search keywords have to precisely match document terms; word substrings could potentially result in a "false positive match" +3. Semantic sensitivity; documents with a similar context, but different term vocabulary won't be associated, resulting in a "false negative match". +4. The order in which terms appear in the document is lost in a vector space representation. diff --git a/data/g2pA_taskd.txt b/data/g2pA_taskd.txt new file mode 100755 index 0000000..f6a3b3e --- /dev/null +++ b/data/g2pA_taskd.txt @@ -0,0 +1,8 @@ + In probability theory, Bayes' theorem (often called Bayes' law after Rev Thomas Bayes) relates the conditional and marginal probabilities of two random events. It is often used to compute posterior probabilities given observations. For example, a patient may be observed to have certain symptoms. Bayes' theorem can be used to compute the probability that a proposed diagnosis is correct, given that observation. +As a formal theorem, Bayes' theorem is valid in all common interpretations of probability. However, it plays a central role in the debate around the foundations of statistics: frequentist and Bayesian interpretations disagree about the ways in which probabilities should be assigned in applications. +Suppose there is a co-ed school having 60% boys and 40% girls as students. The girl students wear trousers or skirts in equal numbers; the boys all wear trousers. An observer sees a (random) student from a distance; all they can see is that this student is wearing trousers. What is the probability this student is a girl? The correct answer can be computed using Bayes' theorem. +The event A is that the student observed is a girl, and the event B is that the student observed is wearing trousers. To compute P(A|B), we first need to know: +P(B|A'), or the probability of the student wearing trousers given that the student is a boy. This is given as 1. +P(A), or the probability that the student is a girl regardless of any other information. Since the observers sees a random student, meaning that all students have the same probability of being observed, and the fraction of girls among the students is 40%, this probability equals 0.4. +P(A'), or the probability that the student is a boy regardless of any other information (A' is the complementary event to A). This is 60%, or 0.6. +P(B|A), or the probability of the student wearing trousers given that the student is a girl. As they are as likely to wear skirts as trousers, this is 0.5 \ No newline at end of file diff --git a/data/g2pA_taske.txt b/data/g2pA_taske.txt new file mode 100755 index 0000000..d1407c0 --- /dev/null +++ b/data/g2pA_taske.txt @@ -0,0 +1,9 @@ +Dynamic Programming is a very powerful mathematical technique, often utilised in programming, for solving optimization problems. Normally, minimizing or maximizing. +‘Greedy’ algorithms focus on making the best local choice at each decision making stage. 
Without a proof of correctness, such an algorithm is likely to fail. With Dynamic Programming, we can design our own algorithm which searches for all possibilities (which ensures correctness) whilst storing the results to avoid having to recomputed (leading to computational efficiency). +Dynamic Programming solves problems by combining the solutions of subproblems. These subproblems are not, however, independent. Subproblems can share subsubproblems, but the solution to one subproblem doesn’t necessarily affect the solutions to other subproblems stemming from the same problem. +Dynamic Programming reduces computation time by solving subproblems in a ‘bottom-up’ way. It stores the solution to a subproblem the first time it is solved, meaning that it can look up the solution when that subproblem is encountered subsequently. +The key to Dynamic Programming is to find the structure of optimal solutions. The steps required are as follows: +1. Generalise the structure of an optimal solution +2. Recursively define the value of an optimal solution +3. Compute the optimal solution values either top-down (with caching), or bottom-up using a table +4. Generate the optimal solution of these computed values diff --git a/data/g2pB_taska.txt b/data/g2pB_taska.txt new file mode 100755 index 0000000..6355405 --- /dev/null +++ b/data/g2pB_taska.txt @@ -0,0 +1,9 @@ + Inheritance is an important feature in object orientated programming. This is because it allows new classes to be made that extend previous classes and to go into more detail. + +This is carried out by allowing the new class to reuse the existing class methods and variables, whilst also creating class specific methods and variables. This means that the new class, the subclass, is a more specialised version of the original, or superclass. + +Because of this it means that the subclass can use all the public methods and variables from the superclass; however any private methods or variables are still private. + +Also it should be noted that a class can only extend one class, e.g. can only be a subclass to one superclass. However a superclass can have more then one subclass and a class can both be a subclass and a superclass. If this occurs then all of the non-private methods and variables can be used by the most specialised class. + +This means that inheritance is used when types have common factors and these would be put into the superclass. Then the subclass/es then extend these to add more detail. An example of this could be using a superclass of employee and then to have two subclasses called fulltime and part time. As employee could have name, address and other details whilst full time could just have salary and part time could work out the salary from part time hours worked, as the full time members of staff wouldnt need these. diff --git a/data/g2pB_taskb.txt b/data/g2pB_taskb.txt new file mode 100755 index 0000000..c4b53c6 --- /dev/null +++ b/data/g2pB_taskb.txt @@ -0,0 +1,9 @@ +The first thing to consider when talking about Googles PageRank algorithm, is that a PageRank is essentially how important that web page is to the internet. So in essence it is a popularity contest between WebPages. + +Originally search engines used highest keyword density, however this could be abused if keyword spamming was implemented. Instead Google uses a system that is based on sites linking to each other, e.g. the more important a site is that is linked to yours the higher your site will become. 
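One of the dynamic-programming answers a little earlier lists computing optimal values either top-down with caching or bottom-up in a table, and another notes that some languages can memoise function calls automatically. A tiny sketch of both approaches on the same stand-in recurrence; the choice of Fibonacci numbers is purely illustrative, and Python's functools.lru_cache supplies the caching here.

# Top-down with caching vs. bottom-up table, on the same toy recurrence (Fibonacci as a stand-in).
from functools import lru_cache

@lru_cache(maxsize=None)
def fib_top_down(n):
    # Each distinct call is computed once; repeated sub-problems hit the cache.
    if n < 2:
        return n
    return fib_top_down(n - 1) + fib_top_down(n - 2)

def fib_bottom_up(n):
    # Solve every sub-problem in advance, smallest first, building up a table.
    table = [0, 1] + [0] * max(0, n - 1)
    for i in range(2, n + 1):
        table[i] = table[i - 1] + table[i - 2]
    return table[n]

print(fib_top_down(30), fib_bottom_up(30))   # both 832040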
+ +The algorithm Google actually users is based on 4 factors, total number of pages, dampening factor, PageRank of a single page and the number of outbound links. A dampening factor is used to counter random surfers, who get bored and then switch to other pages. This formula is then re-used until the results seem to converge together, to find the PageRank, so it is calculated iteratively. + +PageRank is used by Google to measure a popularity of the site and a number between 0-10 is assigned to each webpage depending on their PageRank. This allows Google to calculate a marketing value for different WebPages. + +Also it should be noted that the PageRank is periodically updated every 3 to 6 months, this is counter hackers influence on different PageRanks. diff --git a/data/g2pB_taskc.txt b/data/g2pB_taskc.txt new file mode 100755 index 0000000..59a189c --- /dev/null +++ b/data/g2pB_taskc.txt @@ -0,0 +1,4 @@ +A Vector space model is an algebraic model for representing text documents as vectors of identifiers. A possible use for a vector space model is for retrieval and filtering of information. Other possible uses for vector space models are indexing and also to rank the relevancy of differing documents. +To explain further vector space models, basically a document is characterized by a vector. With each separate term corresponding to the differing dimensions. There has been multiple ways of trying to compute the different possible values for vector space models with the most recognised being the tf-idf weighting. +The differing application has a direct influence on what the definition of the term means. A normal term is usually a single word, keywords or longer phrases. The number of unique words in the vocabulary denotes the dimensionality, if words are used for the terms. +However whilst vector space modelling is useful there are 4 key problems with using it, they are; that the order of the terms are lost, keywords must be precise if searched for, bigger documents have a poor similarity value due to being poorly represented and two documents based on the same topic wont be associated if term vocabulary differs. diff --git a/data/g2pB_taskd.txt b/data/g2pB_taskd.txt new file mode 100755 index 0000000..e7ec076 --- /dev/null +++ b/data/g2pB_taskd.txt @@ -0,0 +1,10 @@ + +In probability theory, Bayes' theorem also called Bayes' law after Rev Thomas Bayes compares the conditional and marginal probabilities of two random events. It is often used to calculate posterior probabilities given observations. For example, a patient may be observed to have certain symptoms. Bayes' theorem can be used to calculate the likelihood that a proposed analysis is accurate, given that observation. +As an official theorem, Bayes' theorem is valid in all universal interpretations of probability. However, it plays a fundamental role in the debate around the foundations of statistics: frequentist and Bayesian interpretations disagree about the ways in which probabilities should be assigned in applications. + Frequentists assign probabilities to random events according to their frequencies of happening or to subsets of populations as proportions of the whole. Whilst Bayesians describe probabilities in terms of beliefs and degrees of uncertainty. The articles on Bayesian probability and frequentist probability discuss these debates in greater detail. +Bayes' theorem compares the conditional and marginal probabilities of events A and B, where B has a non-vanishing probability. 
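A quick numerical check of the trousers-and-skirts example worked through a little earlier, using the stated figures (60% boys who all wear trousers, 40% girls who wear trousers half the time) and the formula whose terms are named just below; the arithmetic gives a 25% chance that the observed trouser-wearing student is a girl.

# Bayes' theorem applied to the trousers/skirts example quoted in the answers above.
p_girl = 0.4                   # P(A):   the student is a girl
p_boy = 0.6                    # P(A'):  the student is a boy
p_trousers_given_girl = 0.5    # P(B|A):  girls wear trousers or skirts in equal numbers
p_trousers_given_boy = 1.0     # P(B|A'): boys all wear trousers

# P(B): total probability of observing trousers, via the law of total probability.
p_trousers = p_trousers_given_girl * p_girl + p_trousers_given_boy * p_boy

# P(A|B) = P(B|A) * P(A) / P(B)
p_girl_given_trousers = p_trousers_given_girl * p_girl / p_trousers
print(round(p_girl_given_trousers, 2))   # 0.25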
+Each term in Bayes' theorem has a conventional name: +P(A) is the previous probability of A. It is "previous" in the sense that it does not take into account any information about B. +P(A|B) is the conditional probability of A, given B. It is also called the subsequent probability because it is derived from or depends upon the specified value of B. +P(B|A) is the conditional probability of B given A. +P(B) is the previous. diff --git a/data/g2pB_taske.txt b/data/g2pB_taske.txt new file mode 100755 index 0000000..a6682d8 --- /dev/null +++ b/data/g2pB_taske.txt @@ -0,0 +1,11 @@ + +In mathematics and computer science, dynamic programming is a method of solving problems, that exhibit the properties of overlapping subproblems and optimal substructure. The method takes much less time than naive methods. +The term was originally used in the 1940s to describe the process of solving problems where one needs to find the best decisions one after another. +The field was founded as a systems analysis and engineering topic that is recognized by the IEEE +The word "programming" in "dynamic programming" has no particular connection to computer programming at all, and instead comes from the term "mathematical programming", a synonym for optimization. Thus, the "program" is the optimal plan for action that is produced. For instance, a finalized schedule of events at an exhibition is sometimes called a program. Programming, in this sense, means finding an acceptable plan of action, an algorithm. +Optimal substructure means that optimal solutions of subproblems can be used to find the optimal solutions of the overall problem. For example, the shortest path to a goal from a vertex in a graph can be found by first computing the shortest path to the goal from all adjacent vertices, and then using this to pick the best overall path. +In general, we can solve a problem with optimal substructure using a three-step process: +1.Break the problem into smaller subproblems. +2.solve these problems optimally using this three-step process recursively. +3.Use these optimal solutions to construct an optimal solution for the original problem. +The subproblems are, themselves, solved by dividing them into sub-subproblems, and so on, until we reach some simple case that is solvable in constant time. diff --git a/data/g2pC_taska.txt b/data/g2pC_taska.txt new file mode 100755 index 0000000..2414314 --- /dev/null +++ b/data/g2pC_taska.txt @@ -0,0 +1,7 @@ +Inheritance is a way to form new classes (instances of which are called objects) using classes that have already been defined. The new classes, known as derived classes, take over (or inherit) attributes and behavior of the pre-existing classes, which are referred to as base classes (or ancestor classes). It is intended to help reuse existing code with little or no modification. + +An advantage of inheritance is that modules with sufficiently similar interfaces can share a lot of code, reducing the complexity of the program. Inheritance therefore has another view, a dual, called polymorphism, which describes many pieces of code being controlled by shared control code. + +Inheritance is typically accomplished either by overriding (replacing) one or more methods exposed by ancestor, or by adding new methods to those exposed by an ancestor. + +In defining this inheritance hierarchy we have already defined certain restrictions, not all of which are desirable. Singleness: using single inheritance, a subclass can inherit from only one superclass. 
Visibility: whenever client code has access to an object, it generally has access to all the object's superclass data. Static: the inheritance hierarchy of an object is fixed at instantiation when the object's type is selected and does not change with time. diff --git a/data/g2pC_taskb.txt b/data/g2pC_taskb.txt new file mode 100755 index 0000000..6e9f1fa --- /dev/null +++ b/data/g2pC_taskb.txt @@ -0,0 +1,5 @@ +The PageRank is a recursive algorithm used by Google to determine which webpages are more important than others. The algorithm considers the importance of a webpage to be reflected by how many other webpages link to that page, and the importance of those pages. + +For each page that links to a page A, the PageRank between zero and one is calculated iteratively according to the following two key factors: The probability of a user navigating away from a page randomly; the PageRank of any page that links to A, divided by the total number of outbound links from that page. This assumes that a link among many outbound links is less valuable than a link among fewer outbound links. A variation of the PageRank method bases the importance of a webpage on how many visits the page gets. + +The method can be abused when people deliberately link to sites in order to raise a site's PageRank. However, it is still a good indicator for search engines to use as a variable in deciding on the most appropriate results according to a query. diff --git a/data/g2pC_taskc.txt b/data/g2pC_taskc.txt new file mode 100755 index 0000000..3938d4c --- /dev/null +++ b/data/g2pC_taskc.txt @@ -0,0 +1,9 @@ +In the vector space model (VSM), documents take the form of "bags of words" - a standard information retrieval approach which represents documents as in a mathematical "bag" structure, recording what terms are present and how often they occur. + +The vector space model is used in information retrieval to determine how similar documents are to one another, and how similar documents are to a search query. + +In a collection of documents, each document can be viewed as a vector of n values (the terms in the document), where each term is an axis. Queries can also be represented as vectors on this vector space model, and so deciding which document matches the query the closest becomes a matter of selecting the document vector which is nearest to the query vector. + +The query vector is compared to each document vector in turn using a "vector similarity measure", which is the cosine of the angle between the query vector and the document vector. + +This equation is calculated by dividing the dot product of the query vector and the document vector by the modulus of the query vector multiplied by the modulus of the document vector. The denominator takes into account differences in the length of the vector, and has the effect of "normalising" the length. Whichever document returns the highest cosine similarity score is considered to be the closest matching document to the query. diff --git a/data/g2pC_taskd.txt b/data/g2pC_taskd.txt new file mode 100755 index 0000000..47180e1 --- /dev/null +++ b/data/g2pC_taskd.txt @@ -0,0 +1,16 @@ +In probability theory, Bayes' theorem relates the conditional and marginal probabilities of two random events. It is often used to compute posterior probabilities given observations. 
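As an editorial illustration of the cosine similarity measure described in g2pC_taskc.txt above (not part of the corpus itself), a minimal Python sketch; the toy vocabulary and term counts are invented.

import numpy as np

# Illustration only: cosine similarity = dot product of the two vectors
# divided by the product of their moduli (lengths).
def cosine_similarity(query, doc):
    return np.dot(query, doc) / (np.linalg.norm(query) * np.linalg.norm(doc))

query = np.array([1, 1, 0, 0])                 # toy query over a 4-term vocabulary
docs = {"doc_a": np.array([3, 1, 0, 2]),
        "doc_b": np.array([0, 0, 5, 1])}

scores = {name: cosine_similarity(query, vec) for name, vec in docs.items()}
print(max(scores, key=scores.get), scores)     # highest cosine = closest match

The document whose vector makes the smallest angle with the query vector (highest cosine) is returned first, exactly as the answer describes.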
+ +Bayes' theorem is expressed mathematically as: + +P(A|B) = (P(B|A)P(A))/P(B) + +where P(A|B) is the conditional probability of A given B, P(A) is the prior probability of A, P(B) is the prior probability of B, and P(B|A) is the conditional probability of B given A. + +Bayes' theorem relates the conditional and marginal probabilities of two random events P(A) and P(B), and is valid in all common interpretations of probability. For example, in a school in made up of 3/5 boys and 2/5 girls, the girls wear skirts of trousers in equal numbers and the boys all wear trousers. If a student is observed from a distance wearing trousers, Bayes theorem can be used to determine the probability of this student being a girl. + +P(A) is the probability of the student being a girl (which is 2/5). +P(B|A) is the probability of the student wearing trousers given that the student is a girl, which is 0.5 +P(B) is the probability of a random student wearing trousers, which can be calculated as P(B) = P(B|A)P(A) + P(B|A')P(A') where ' denotes a complementary event, which is 0.8. +Therefore the probability using the formula is 0.25. + +Bayes theorem is often used to compute posterior probabilities given observations, for instance the probability that a proposed medical diagnosis is correct, given certain observed symptoms. diff --git a/data/g2pC_taske.txt b/data/g2pC_taske.txt new file mode 100755 index 0000000..2413ecb --- /dev/null +++ b/data/g2pC_taske.txt @@ -0,0 +1,5 @@ +Dynamic programming is a problem-solving method which solves recursive problems. The term is derived from mathematical programming which is commonly referred to as optimisation, hence dynamic programming is an optimal method of solving the problems and takes much less time than naïve methods. + +Dynamic programming uses the properties of optimal substructure, overlapping subproblems and memoization to create an algorithm to solve such problems. Optimal substructure means that the structure of the problem is made up of sub-problems which can be used to find the solution to the problem overall. A problem with overlapping subproblems means that the same subproblems may be used to solve many different larger problems. Each sub-problem is solved by being divided into sub-subproblems, until a case is reached which is solvable in constant time. Memoization stores solutions which have already been computed in order to reduce unnecessary re-computation. + +Dynamic programming can be divided into two main approaches: top-down and bottom-up. The top-down approach breaks the problem into subproblems, which are solved and remembered, using a combination of memoization and recursion. The bottom-up approach solves all subproblems that might be need in advance, and then uses these solutions to build up the solutions to the bigger problem. diff --git a/data/g2pE_taska.txt b/data/g2pE_taska.txt new file mode 100755 index 0000000..e8ca270 --- /dev/null +++ b/data/g2pE_taska.txt @@ -0,0 +1,6 @@ +When we talk about inheritance in object-oriented programming languages, which is a concept that was invented in 1967 for Simula, we are usually talking about a way to form new classes and classes are instances of which are called objects and involve using classes that have already been defined. +Derived classes are intended to help reuse existing code with little or no modification and are the new classes that take over (or inherit) attributes and behavior of the pre-existing classes, usually referred to as base classes (or ancestor classes). 
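As an editorial check (not corpus text), the numbers quoted in g2pC_taskd.txt above do work out: with P(B) = 0.5 x 0.4 + 1.0 x 0.6 = 0.8 (since all the boys wear trousers), Bayes' theorem gives P(A|B) = P(B|A)P(A)/P(B) = (0.5 x 0.4)/0.8 = 0.25, i.e. a 25% chance that a student seen wearing trousers is a girl.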
+Categorization in computer languages is a powerful way number of processing information and inheritance provides the support for representation by categorization. Furthermore, it is fundamental for helping humans learn by means of generalization in what is known about specific entities is applied to a wider group given a belongs relation can be established and cognitive processing which involves less information being acquired to be stored about each specific entity, but in actual fact only its particularities. +An instance of a "fruit" is a generalization of "apple", "orange", "mango" and many others. Inheritance can also sometimes be referred to as generalization, because is-a relationships represent a hierarchy amongst classes of objects. It can be considered that fruit is an abstraction of apple, orange, etc. Conversely, since apples are fruit, they may naturally inherit all the properties common to all fruit, such as being a fleshy container for the seed of a plant. + Modules with sufficiently similarities in interfaces would be able to share a lot of code and therefore reducing the complexity of the program. This can be known as one of the advantages of inheritance. Therefore inheritance can be known to have a further view, a dual, which describes many parts of code that are under control of shared control code, named as polymorphism. +On the other hand, inheritance is normally accomplished either by replacing one or more methods exposed by ancestor, or by adding new methods to those exposed by an ancestor. A well known term used for this replacing act is called overriding. diff --git a/data/g2pE_taskb.txt b/data/g2pE_taskb.txt new file mode 100755 index 0000000..d17fa9a --- /dev/null +++ b/data/g2pE_taskb.txt @@ -0,0 +1,2 @@ +PageRank is a link analysis algorithm that is used by search engine such as Google Internet that assigns a numerical weighting to every element of a hyperlinked set of documents, like the World Wide Web, with the hope of "measuring" the relative importance held in the set. The algorithm may be applied to any numbr of entities with reciprocal quotations and references. The weight taking a numerical value which assigns to any given element E is also known as the PageRank of E and is denoted by PR(E). +A trademark of Google has the name "PageRank" and this process has been patented (U.S. Patent 6,285,999 ). Nevertheless, the patent is assigned to the University of Stanford and not to Google. Google has exclusive license rights on the patent from the University of Stanford and the university received 1.8 million shares in Google in exchange for use of the patent; the in the year 2005, shares were sold for $336 million. diff --git a/data/g2pE_taskc.txt b/data/g2pE_taskc.txt new file mode 100755 index 0000000..48d0b2d --- /dev/null +++ b/data/g2pE_taskc.txt @@ -0,0 +1,2 @@ +nformation retrieval (IR) is the science of searching for documents, for information within documents and for metadata about documents, as well as that of searching relational databases and the World Wide Web. IR is interdisciplinary, based on computer science, mathematics, library science, information science, information architecture, cognitive psychology, linguistics, statistics and physics. There is overlap in the usage of the terms data retrieval, document retrieval, information retrieval, and text retrieval, but each also has its own body of literature, theory, praxis and technologies. +Automated information retrieval systems are used to reduce what has been called "information overload". 
Many universities and public libraries use IR systems to provide access to books, journals and other documents. diff --git a/data/g2pE_taskd.txt b/data/g2pE_taskd.txt new file mode 100755 index 0000000..8484a06 --- /dev/null +++ b/data/g2pE_taskd.txt @@ -0,0 +1,9 @@ +The Probability of an event happening mean considering the likelihood of or the number of the instance occurring, and dividing this value by the total number of events. The equation for this calculation would look as follows: +Probability (P) = number of instance / total number of events +On the other hand Probability Theory (P) usually involves assigning values to events. For example: +(P)=1: event is certain to occur +(P)=0: event is certain NOT to occur +(P)=0.5: event occurs half of the time. + +There is also Conditional Probability which is usually interested in the way variables relate to each other. Bayes Theorem is the name given to an important theorem relating +Conditional probabilities and it can be seen as a way of understanding how the probability that a theory is true, is affected by a new piece of evidence. It has been used in a wide variety of contexts, ranging from marine biology to the development of "Bayesian" spam blockers for email systems. diff --git a/data/g2pE_taske.txt b/data/g2pE_taske.txt new file mode 100755 index 0000000..dff023c --- /dev/null +++ b/data/g2pE_taske.txt @@ -0,0 +1,5 @@ +Dynamic Programming (DP) is in basic terms an algorithm design technique that is used for optimization problems and often involves minimizing or maximizing. +Furthermore, by combining solutions to subproblems, DP solves problems. Subproblems may include and contain many other subsubproblems and even in such cases, the solution to one subproblem may not affect the solutions to other subproblems involved in the same problem. +By solving subproblems in a bottom-up fashion, which is basically when storing solution to a subproblem the first time it is solved and looking up to find the solution when a subproblem is come across once more, this would cause DP to reduce computations. +The following is a generalization path to be taken in Dynamic Programming: +Firstly it is needed to Characterize the structure of an optimal solution. Secondly to define the value of the optimal solution recursively. Furthermore, to compute the optimal solution values either by following a top-down method with caching, or a bottom-up method in a table. The last point would be to construct an optimal solution from the computed values. diff --git a/data/g3pA_taska.txt b/data/g3pA_taska.txt new file mode 100755 index 0000000..420c94b --- /dev/null +++ b/data/g3pA_taska.txt @@ -0,0 +1 @@ +In object-oriented programming, inheritance is the ability to specify one class to be a subclass of another; this leads to a hierarchy of classes, with the child classes inheriting and specialising - and sometimes adding to - the functionality and data structures of the parent classes. The hierarchy that is formed is also useful for the organisation of classes and objects, as it defines a relationship between the child and the parent (the child class is a kind of the parent class). Inheritance is useful for situations where several classes share common features, such as needed functions or data variables. In addition to this, child classes can be referenced in terms of their parent classes, which can be useful when storing large data structures of objects of several classes, which can all be referenced as one base class. 
Inheritance is a core aspect of object-oriented programming, and is available in some form or another in most, if not all, object oriented languages available today. Most of these languages provide an extend keyword, which is used to subclass another. Also, the functions and data variables that are inherited by the subclasses can be controlled through the use of visibility modifiers. \ No newline at end of file diff --git a/data/g3pA_taskb.txt b/data/g3pA_taskb.txt new file mode 100755 index 0000000..fbf77d9 --- /dev/null +++ b/data/g3pA_taskb.txt @@ -0,0 +1,4 @@ +The Google search engine uses a link analysis algorithm called PageRank to assign a relative numerical importance to a set of hyperlinked documents, such as the World Wide Web. +For a given page, it's importance (the PageRank value) results from a ballot among all the other pages in the set. For a page to give a vote to another, it must link to it, and so the PageRank depends on the number of incoming links, anf the PageRank of those pages that provide the links. Pages that are linked to by many high ranking pages will themselves obtain a high rank. If a page has no incoming links, there is no support for that page. +The PageRank is a numeric weighting of 0 to 10, and denotes how important a site is in Google's eyes. Like the Richter Scale, the PageRank is a value on a logerithmic scale that is derived from a probability. In addition to the quantity and quality of inbound links,other factors affect the PageRank, such as the number of visits to the page and the search words that are used on the page. +To prevent sites from manipulating or spoofing PageRank, very little details are provided by Google as to what factors actually affect it. \ No newline at end of file diff --git a/data/g3pA_taskc.txt b/data/g3pA_taskc.txt new file mode 100755 index 0000000..0459168 --- /dev/null +++ b/data/g3pA_taskc.txt @@ -0,0 +1,11 @@ +Vector space model, or term vector model as it is also known, is an algebraic model for representing objects (although it is mainly used for text documents) as vectors of identifiers; for example, index terms. It is used in information retrieval and filtering, indexing and relevancy rankings, and was first used in the SMART Information Retrieval System. + +A document is represented as a vector, with each dimension corresponding to a separate term. If a term occurs in the document, the value will be non-zero in the vector. Many different ways of computing these values (aka (term) weights) have been developed; one of the best known schemes is tf-idf weighting. + +The way that a 'term' is defined depends on the application. Typically, terms are single words, keywords, or sometimes even longer phrases. If the words are chosen as the terms, the number of dimensions in the vector is the number of distinct words in the corpus. + +Relevancy ranks for documents, in a keyword search, can be calculated; this uses the assumptions of document similarities theory, by comparing the difference of angles between each document vector and the original query vector, where the query is represented as same format vector as the documents. + +Generally, it is easier to calculate the cosine of the angle between the vectors instead of the angle itself. A zero value for the cosine indicates that the query and document vector are orthogonal and so had no match; this means the query term did not exist in the document being considered. + +However, the vector space model has limitations. 
Long documents are poorly represented due to their poor similarity values (a small scalar product and a large dimensionality); search keywords must match precisely the document terms; word substrings might result in a "false positive match"; similar context documents but different term vocabulary won't be associated, leading to a "false negative match"; and the order that the terms appear in the document is not represented in the vector space model. diff --git a/data/g3pA_taskd.txt b/data/g3pA_taskd.txt new file mode 100755 index 0000000..cf09e01 --- /dev/null +++ b/data/g3pA_taskd.txt @@ -0,0 +1,15 @@ +In probability theory, Bayes' theorem (often called Bayes' law after Rev Thomas Bayes) relates the conditional and marginal probabilities of two random events. It is often used to compute posterior probabilities given observations (for example, a patient may be observed to have certain symptoms). Bayes' theorem can be used to compute the probability that a proposed diagnosis is correct, given that observation. + +As a formal theorem, Bayes' theorem is valid in all common interpretations of probability. However, it plays a central role in the debate around the foundations of statistics; frequentist and Bayesian interpretations disagree about the ways in which probabilities should be assigned in applications. Frequentists assign probabilities to random events according to their frequencies of occurrence or to subsets of populations as proportions of the whole, while Bayesians describe probabilities in terms of beliefs and degrees of uncertainty. The articles on Bayesian probability and frequentist probability discuss these debates in greater detail. + +Bayes' theorem relates the conditional and marginal probabilities of events A and B, where B has a non-vanishing probability: + + P(A|B) = (P(B | A) x P(A)) / P(B). + +Each term in Bayes' theorem has a conventional name: +P(A) is the prior probability or marginal probability of A. It is "prior" in the sense that it does not take into account any information about B. + P(A|B) is the conditional probability of A, given B. It is also called the posterior probability because it is derived from or depends upon the specified value of B. +P(B|A) is the conditional probability of B given A. +P(B) is the prior or marginal probability of B, and acts as a normalizing constant. + +Intuitively, Bayes' theorem in this form describes the way in which one's beliefs about observing 'A' are updated by having observed 'B'. \ No newline at end of file diff --git a/data/g3pA_taske.txt b/data/g3pA_taske.txt new file mode 100755 index 0000000..5db4328 --- /dev/null +++ b/data/g3pA_taske.txt @@ -0,0 +1,4 @@ +In the field of computer science, term 'dynamic programming' relates to the style of programming that breaks a large problem down into smaller subproblems, and generally allows for the finding of the optimal solution. When the problem is split into subproblems, these themselves may be split into smaller problems, and so on, until they cannot be reduced any more. +It is also common for dynamic programming to make use of recursion, and the saving of previous results for faster computation later; this also leads to higher efficiency, as calculations are not being redone. For example, when a problem is reduced into sub problems, and those are then reduced further, it may be that there are common subsubproblems, and so only one calculation needs to be done and the result saved to help solve more than one subproblem. 
+An example of this gain in efficiency is a path-finding problem. If there are two distinct routes in a network of 10 nodes, tagged A to J, then if the two routes share a common section (say, between nodes B and D), the cost of that section should be calculated for the first route and saved. Then, when the second route is being processed, the cost of B to D does not need to be calculated again. +In general, dynamic programming is used on optimisation problems, where the most efficient solution is needed. Areas where this sort of programming is useful is in AI, computer graphics, compression routines, and biomedical applications. \ No newline at end of file diff --git a/data/g3pB_taska.txt b/data/g3pB_taska.txt new file mode 100755 index 0000000..abb6b11 --- /dev/null +++ b/data/g3pB_taska.txt @@ -0,0 +1,7 @@ +Inheritance is a concept in Object Oriented programming where a child- or sub-class inherits characteristics from a parent- or super-class. The concept takes its name from genetic inheritance where a child can inherit genetic characteristics from its parents. + +Inheritance, at its simplest, allows programmers to model a relationship where one object is a kind of another. For instance two classes, one representing an undergraduate student and another representing a post-graduate student could both be said to belong to a more generalised class representing all students. Similarly, we could say that dogs and cats are two kinds of animal, or that bridges and skyscrapers are two types of man-made structure. + +Subclasses are said to extend or specialise their superclasses. Attributes (variables) and behaviours (functions) that are common between classes can be included in the definition of the superclass, leaving the subclass definitions containing only the attributes and behaviours that are unique to that class. + +Inheritance can be used to create a multiple level architecture of classes. In such an architecture even the bottom-most subclasses inherit all of the attributes and behaviours that are defined in the very top-most superclasses. This can save the programmer time because it renders unnecessary a lot of code duplication. \ No newline at end of file diff --git a/data/g3pB_taskb.txt b/data/g3pB_taskb.txt new file mode 100755 index 0000000..3861ae0 --- /dev/null +++ b/data/g3pB_taskb.txt @@ -0,0 +1,7 @@ +PageRank is an algorithm that was developed by Google to provide the most relevant search results to its users queries. PageRank, along with similar algorithms developed by Google’s competitors for their search engines, is part of the second generation of technologies designed to rate the importance of web pages: the first, which was solely based on keywords in the page content and meta-data, could easily be influenced by those wishing to obtain a higher ranking for their less-relevant pages. + +The different with PageRank is that it tries to determine a web page’s relevance to users by attempting to determine its importance. It does this by assigning it a value of importance that is dependant upon the number of web sites that link to that page, taking into account the importance value, or PageRank, of those pages. The PageRank is computed iteratively, and it is found that the PageRank values converge fairly rapidly. 
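The iterative computation that g3pB_taskb.txt refers to is usually written, in one commonly cited probability-normalised form (an editorial note, not corpus text), as

PR(A) = (1 - d)/N + d * ( PR(T1)/C(T1) + ... + PR(Tn)/C(Tn) )

where T1..Tn are the pages that link to A, C(Ti) is the number of outbound links on page Ti, N is the total number of pages, and d is the damping factor (commonly 0.85). The update is repeated until the values stop changing appreciably, which is the convergence the answer mentions.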
+ +Although it is much better than simple keyword-based ranking algorithms, PageRank is not infallible: we have an internet where advertising revenue can make up most - and quite frequently all – of a web site’s income and the people that run these web sites will always be trying to trick the system into giving their pages a higher PageRank. One of Google’s attempts to counter this is their Google Toolbar browser plugin. + +Google Toolbar is a free tool which provides a number of useful functions in a convenient location: the users web browser window. Google’s payoff is that it gets to track the behaviour of actual users. This allows them to see whether their PageRank algorithm is accurate in assigning high PageRank values to the most relevant web pages and, just as importantly, low values to those that are irrelevant and try to fool the system. diff --git a/data/g3pB_taskc.txt b/data/g3pB_taskc.txt new file mode 100755 index 0000000..f1f4c5b --- /dev/null +++ b/data/g3pB_taskc.txt @@ -0,0 +1,7 @@ +There are a large number of models used in solving the problem of Information Retrieval and they are all based on one of three mathematical bases: set theory, algebra and probabilistic. The vector space model is one of these methods, and it is an algebraic model. + +In the vector space model a document is represented as a vector. Within this vector, each dimension corresponds to a separate term (where a term is typically a single word, keyword or phrase.) If the term doesn’t occur within the document, the value in the vector is zero. If a term occurs in the document, its value is non-zero. + +To calculate how relevant each document is in a keyword search the cosine value of the angle between the vectors is easier to calculate instead of the actual angle. + +The vector space model, however, is not without its limitations: they have small similarity values, long documents are poorly represented; the order of words does not matter; false positive matches may be brought about by terms contained within words themselves; and documents that should match but use different semantics may return false negative matches. There are a number of other models that are based on or extend the vector space model, and these are designed to try to eradicate these problems. diff --git a/data/g3pB_taskd.txt b/data/g3pB_taskd.txt new file mode 100755 index 0000000..fa24e97 --- /dev/null +++ b/data/g3pB_taskd.txt @@ -0,0 +1,5 @@ +Bayes' theorem (often called Bayes' law) connects the conditional and marginal probabilities of two arbitrary events. One of its uses is calculating posterior probabilities given observations. + +Bayes' theorem plays a key role in the debate around the principles of statistics: frequentist and Bayesian interpretations disagree about the ways in which probabilities should be assigned in applications. + +Bayes' theorem is useful in evaluating the result of drug tests. If a test can identify a drug user 99% of the time, and can identify a non-user as testing negative 99% of the time, it may seem to be a relatively accurate test. However, Bayes' theorem will reveal the flaw that despite the apparently high accuracy of the test, the probability that an employee who tested positive actually did use drugs is only about 33%. 
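As an editorial illustration of the drug-test example in g3pB_taskd.txt above (not corpus text): the quoted ~33% only follows once a base rate of drug use is assumed, which the answer omits. A minimal Python sketch assuming a 0.5% base rate:

# Illustration only; the 0.5% base rate is an assumption, not stated in the text.
sensitivity = 0.99      # P(test positive | user)
specificity = 0.99      # P(test negative | non-user)
base_rate = 0.005       # assumed P(user)

p_positive = sensitivity * base_rate + (1 - specificity) * (1 - base_rate)
p_user_given_positive = sensitivity * base_rate / p_positive
print(round(p_user_given_positive, 3))   # ~0.332, i.e. about 33%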
\ No newline at end of file diff --git a/data/g3pB_taske.txt b/data/g3pB_taske.txt new file mode 100755 index 0000000..3fdee5b --- /dev/null +++ b/data/g3pB_taske.txt @@ -0,0 +1,7 @@ +Dynamic Programming is a method of solving problems that exhibit the properties of overlapping subproblems and optimal substructure. The term was originally used in the 1940s by Richard Bellman. + +The word "programming" in "dynamic programming" has no particular connection to computer programming at all, and instead comes from the term "mathematical programming", a synonym for optimization. The "program" is the optimal plan for action that is produced. + +For instance, a finalized schedule of events at an exhibition is sometimes called a program. Programming, in this sense, means finding an acceptable plan of action. + +To say that a problem has overlapping subproblems is to say that the same subproblems are used to solve many different larger problems. Optimal substructure means that optimal solutions of subproblems can be used to find the optimal solutions of the overall problem. \ No newline at end of file diff --git a/data/g3pC_taska.txt b/data/g3pC_taska.txt new file mode 100755 index 0000000..b0a3f3a --- /dev/null +++ b/data/g3pC_taska.txt @@ -0,0 +1,15 @@ +In object-oriented programming, inheritance is a way to form new classes +(instances of which are called objects) using classes that have already +been defined. + +Inheritance is also sometimes called generalization, because the is-a +relationships represent a hierarchy between classes of objects. For +instance, a "fruit" is a generalization of "apple", "orange", "mango" +and many others. One can consider fruit to be an abstraction of apple, +orange, etc. Conversely, since apples are fruit (i.e., an apple is-a +fruit), apples may naturally inherit all the properties common to all +ruit, such as being a fleshy container for the seed of a plant. + +Inheritance is typically accomplished either by overriding (replacing) +one or more methods exposed by ancestor, or by adding new methods to +those exposed by an ancestor. \ No newline at end of file diff --git a/data/g3pC_taskb.txt b/data/g3pC_taskb.txt new file mode 100755 index 0000000..024f3ca --- /dev/null +++ b/data/g3pC_taskb.txt @@ -0,0 +1,20 @@ +The PageRank algorithm used by google harnesses the implicit collective +intelligence present in the structure of the world wide web. Any page on +the Internet will generally link to at least one other, by modelling this +link structure as a graph, we can build up a symbolic representation of +the world wide web. + +As the basic level, the nodes with the highest degrees can be considered +the most "popular" and by inference the most important - which can be used +to rank the pages when returning search results. + +Expanding on this theory, we can then say that the links from an important +pages are themselves more important. Using this idea we can adjust the +rankings of our pages so that pages linked to be the most important pages +are considered more relevant. + +The actual Google PageRank algorithm is much more complex than this, but +follows the same underlying principles. It incorporates some more advanced +reasoning to avoid website creators exploiting their knowledge of the algorithm +to artificially increase their PageRank through use of web-rings and other +similar reciprocal hyperlinking schemes. 
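As an editorial illustration of the is-a relationship and method overriding described in g3pC_taska.txt above (not corpus text), a short Python sketch with invented names:

class Fruit:
    def describe(self):
        return "a fleshy container for the seed of a plant"

class Apple(Fruit):
    # overriding: the subclass replaces the method exposed by its ancestor
    def describe(self):
        return "an apple, which is " + super().describe()

print(Apple().describe())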
\ No newline at end of file diff --git a/data/g3pC_taskc.txt b/data/g3pC_taskc.txt new file mode 100755 index 0000000..5d34c20 --- /dev/null +++ b/data/g3pC_taskc.txt @@ -0,0 +1,10 @@ +Using the vector space model for Information Retrieval models all pages +and queries as high-dimensional sparse vectors. Each item in the vector +represents a different keyword. + +The similiarity betweeen two pages or a query and a page can be computed +using the dot product formula to find the cosine between them. This +represents the angle between them, but in n-dimensional space. Results +will range from -1 to 1, with 1 being a close match. Normally the vectors +will not have any negative values, so results will always be greater than +or equal to 0. The cosine is computed using: cos A = (|a||b|)/(a.b) \ No newline at end of file diff --git a/data/g3pC_taskd.txt b/data/g3pC_taskd.txt new file mode 100755 index 0000000..f7c7484 --- /dev/null +++ b/data/g3pC_taskd.txt @@ -0,0 +1,13 @@ +In probability theory, the prior and conditional probabilities +of two random events are related by Bayes' theorem. The theorem is +often used when we have observations and wish to compute posterior +probabilities. + +For example, given an observation that a patient is seen to have certain +symptoms, we can use Bayes' theorem to compute the probability that a +suggested diagnosis is correct. + +P(A) is the prior probability of A. P(A|B) is the conditional probabilty +of A given B. P(B|A) is the conditional probabilty of B given A. P(B) is +the prior probability of B, and must be non-zero. Bayes' theorem is given +by P(A|B) = (P(B|A)P(A))/(P(B)). \ No newline at end of file diff --git a/data/g3pC_taske.txt b/data/g3pC_taske.txt new file mode 100755 index 0000000..3692426 --- /dev/null +++ b/data/g3pC_taske.txt @@ -0,0 +1,13 @@ +In computer science and mathematics, dynamic programming +is a method of problem solving that utilises the properties +of overlapping subproblems and optimal substructure. And thus +the method takes much less time than more naive methods. + +In "dynamic programming", the word "programming" has no +real connection to computer programming at all, it actually +comes from the term "mathematical programming", +a synonym for optimisation. Thus, the "program" is the optimal +plan of action that is being produced. For example, a +schedule of events at an exhibition is sometimes called a +programme. Programming, in this sense, means finding an +acceptable plan, an algorithm. \ No newline at end of file diff --git a/data/g4pB_taska.txt b/data/g4pB_taska.txt new file mode 100755 index 0000000..4244b8d --- /dev/null +++ b/data/g4pB_taska.txt @@ -0,0 +1,3 @@ +Inheritance is the ability of a subclass to inherit default, protected and public attributes and methods from its superclasses. Each object (except java.lang.Object) can be cast to an object of one of its superclasses. However an object cannot be cast to a class which is no relative of it. Here is an example of inheritance: +We have the class of all living things which have attributes like weight and age. We have the classes of animals, plants, viruses and fungi that are subclasses of the class of all living things. The animals have their unique attributes (organs, hair, etc.) and methods (walking, mating, etc.). They also inherit the attributes and methods of its superclass. Animals can be treated (cast) to living things. However, animals cannot be treated as fungi. +In object oriented programming inheritance is also dependant on access level modifiers. 
For example private attributes and methods cannot be inherited. Virtual attributes and methods can be shadowed/overridden. In Java all attributes and methods are implicitly virtual. Object variable can store a reference to the same class or a subclass (i.e. this or more specialised version). However, object variables cannot store references to a superclass (i.e. less specialised version) of the original class. diff --git a/data/g4pB_taskb.txt b/data/g4pB_taskb.txt new file mode 100755 index 0000000..90fb4fa --- /dev/null +++ b/data/g4pB_taskb.txt @@ -0,0 +1 @@ +Page rank algorithm is used to determine a webpages importance or relevance in the web dependant on certain criteria. The criteria may include numbers of word matches with the search terms, number of other webpages that link this one and/or cite it as a source, number of unique visits for certain amount of time etc. There are some techniques that try to fool the search engines like link farms, keyword spamming and a lot of meta tags. The last two are somewhat easier to be dealt with (simply by being ignored most of the time). Link farms are groups of sites that are producing links between each other pursuing higher link counts. The reason for such manipulations is the pursuit of higher page rank so even higher number of users will see the page which will lead to higher income. Link farms can be exploited by joining to them and get inbound linkage but refuse to add links for ones own site to the sites from the link farm. Googles toolbar tries to follow the intentional user model by counting the visits from actual users (i.e. not computer bots) to a website. Page ranks can be calculated either recursively or iteratively. One of the most important uses of page rank is its meaning to advertising. diff --git a/data/g4pB_taskc.txt b/data/g4pB_taskc.txt new file mode 100755 index 0000000..ee97113 --- /dev/null +++ b/data/g4pB_taskc.txt @@ -0,0 +1,12 @@ +The vector space model (or term vector model) is an algebraic model for representing text documents (and any objects, in general) as vectors of identifiers, such as index terms. It is used in information filtering, information retrieval, indexing and relevancy rankings. It was used in the first time in the SMART Information Retrieval System. + +A document is represented as a vector. Each and every dimension corresponds to a separate term. If a term exists in a document, its value in the vector is not equal to zero. A couple of different algorithms of computing these values, also known as (term) weights, have been created. One of the most popular schemes is tf-idf weighting. + +The definition of term is dependent on the application. Typically terms are keywords, single words or longer phrases. Provided that words are selected to be the terms, the dimensionality of the vector is equal to the number of words in the vocabulary. + + +It is easiest to calculate the cosinus of the angle between the vectors instead of the angle by the formula: + + cos(theta)=v1.v2/(||v1||||v2||) + +A null cosinus value says that the query and document vector were orthogonal and had no match which means that no term of the query was ever encountered in the document. diff --git a/data/g4pB_taskd.txt b/data/g4pB_taskd.txt new file mode 100755 index 0000000..c3057e0 --- /dev/null +++ b/data/g4pB_taskd.txt @@ -0,0 +1,15 @@ +In probability theory, Bayes' theorem (or Bayes' law after Rev Thomas Bayes) provides relation between the conditional and marginal probabilities of two random events. 
It is usually used to calculate posterior probabilities given observations. For example: a patient might be observed to show certain symptoms. Bayes' theorem could be used to compute the probability that a certain diagnosis is right, given that observation. + +Since it is a formal theorem, Bayes' theorem holds in all popular interpretations of probability. +Bayes' theorem relates the conditional and marginal probabilities of events a and b, where b has a non-vanishing probability: + + P(a|b) = P(a|b)P(a)/P(b) + +Terms in Bayes' theorem are named by a convention: + +P(A) is the prior probability or marginal probability of A. It does not take into account any information about B and therefore is considered prior. +P(A|B) is the conditional probability of A, given B. It it is derived from or depends upon the specified value of B. Usually it is called the posterior probability +P(B|A) is the conditional probability of B given A. +P(B) (a.k.a. the normalizing constant) is the prior or marginal probability of B. + +Obviously, Bayes' theorem describes the way in which one's assumptions about observing the event'a' are changed by having observed the event 'b'. diff --git a/data/g4pB_taske.txt b/data/g4pB_taske.txt new file mode 100755 index 0000000..3daabd9 --- /dev/null +++ b/data/g4pB_taske.txt @@ -0,0 +1,28 @@ +In mathematics and computer science, dynamic programming is a method of solving problems that exhibit the properties of overlapping subproblems and optimal substructure. + +The word "programming" in "dynamic programming" has no particular connection to computer programming at all, and instead comes from the term "mathematical programming", a synonym for optimization. Programming, in this sense, means finding an acceptable plan of action, an algorithm. + +Optimal substructure means that optimal solutions of subproblems can be used to find the optimal solutions of the overall problem. In general, we can solve a problem with optimal substructure using a three-step process: + + 1. Break the problem into smaller subproblems. + 2. Solve these problems optimally using this three-step process recursively. + 3. Use these optimal solutions to construct an optimal solution for the original problem. + +The subproblems are, themselves, solved by dividing them into sub-subproblems, and so on, until we reach some simple case that is solvable in constant time. + +To say that a problem has overlapping subproblems is to say that the same subproblems are used to solve many different larger problems. For example, in the Fibonacci sequence, F3 = F1 + F2 and F4 = F2 + F3 computing each number involves computing F2. Because both F3 and F4 are needed to compute F5, a naive approach to computing F5 may end up computing F2 twice or more. This applies whenever overlapping subproblems are present: a naive approach may waste time recomputing optimal solutions to subproblems it has already solved. + +In order to avoid this, we instead save the solutions to problems we have already solved. Then, if we need to solve the same problem later, we can retrieve and reuse our already-computed solution. If we are sure we won't need a particular solution anymore, we can throw it away to save space. In some cases, we can even compute the solutions to subproblems we know that we'll need in advance. 
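As an editorial illustration of the Fibonacci overlap described in g4pB_taske.txt above (not corpus text), a minimal Python sketch of both approaches; lru_cache simply plays the role of the saved-solutions table.

from functools import lru_cache

@lru_cache(maxsize=None)                  # top-down: cache each F(k) the first time it is solved
def fib(n):
    return n if n < 2 else fib(n - 1) + fib(n - 2)

def fib_bottom_up(n):
    # bottom-up: fill a table of subproblem solutions in advance
    table = [0, 1] + [0] * max(0, n - 1)
    for k in range(2, n + 1):
        table[k] = table[k - 1] + table[k - 2]
    return table[n]

print(fib(5), fib_bottom_up(5))           # both give F5 = 5; F2 is computed only once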
+ +dynamic programming makes use of: + + Overlapping subproblems + Optimal substructure + Memoization + +Dynamic programming usually takes one of two approaches: + + Top-down approach + Bottom-up approach + + diff --git a/data/g4pC_taska.txt b/data/g4pC_taska.txt new file mode 100755 index 0000000..fd139a5 --- /dev/null +++ b/data/g4pC_taska.txt @@ -0,0 +1,7 @@ +In object-oriented programming, inheritance is a way to form new classes (instances of which are called objects) using classes that have already been defined. The inheritance concept was invented in 1967 for Simula +Inheritance provides the support for representation by categorization in computer languages. Categorization is a powerful mechanism number of information processing, crucial to human learning by means of generalization and cognitive economy (less information needs to be stored about each specific entity, only its particularities). +The new classes, known as derived classes, take over (or inherit) attributes and behavior of the pre-existing classes, which are referred to as base classes (or ancestor classes). It is intended to help reuse existing code with little or no modification. +Inheritance is also sometimes called generalization, because the is-a relationships represent a hierarchy between classes of objects. For instance, a "fruit" is a generalization of "apple", "orange", "mango" and many others. One can consider fruit to be an abstraction of apple, orange, etc. Conversely, since apples are fruit (i.e., an apple is-a fruit), apples may naturally inherit all the properties common to all fruit, such as being a fleshy container for the seed of a plant. +An advantage of inheritance is that modules with sufficiently similar interfaces can share a lot of code, reducing the complexity of the program. Inheritance therefore has another view, a dual, called polymorphism, which describes many pieces of code being controlled by shared control code. +Inheritance is typically accomplished either by overriding (replacing) one or more methods exposed by ancestor, or by adding new methods to those exposed by an ancestor. +Complex inheritance, or inheritance used within a design that is not sufficiently mature, may lead to the Yo-yo problem. diff --git a/data/g4pC_taskb.txt b/data/g4pC_taskb.txt new file mode 100755 index 0000000..660919c --- /dev/null +++ b/data/g4pC_taskb.txt @@ -0,0 +1,6 @@ +Since the develop of the Web 2.0, Google as one of the most popular search engine in the world, there are many algorithms in the web search. Accordingly, implementations of link analysis algorithms will typical discount such “internal” links. The word computer can be exploited by web search engines such as Google. Thus, the web is just like a graph, and the PageRank, which is our first technique for analysing the link which is assigns to every node in +the web graph a numerical score between 0 and 1. Since the PageRank is the most important algorithms which is used in the Google engine. For example, there are four pages group: A, B, C and D. If every page link to A, then A’s PageRank value shoule be the total value of B, C and D . +PR(A) = PR(B) + PR(C) + PR(D) +Moreover, there is a q = 0.15 which is be use in the web page, like the general algorithm below: + +However, the disadvantage is of PageRank algorithm is that the renew system is too slow. 
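As an editorial illustration of the iterative PageRank computation described in g3pB_taskb.txt and g4pC_taskb.txt above (not corpus text), a minimal Python sketch; the four-page link structure is invented, and q = 0.15 is taken as the complement of a 0.85 damping factor.

# Illustration only: repeat the update until the values settle (converge).
links = {"A": ["B", "C"], "B": ["C"], "C": ["A"], "D": ["C"]}
pages = list(links)
n = len(pages)
d = 0.85                                   # damping factor, so q = 1 - d = 0.15

pr = {p: 1.0 / n for p in pages}           # start from a uniform distribution
for _ in range(50):
    pr = {p: (1 - d) / n
             + d * sum(pr[src] / len(links[src]) for src in pages if p in links[src])
          for p in pages}

print({p: round(v, 3) for p, v in pr.items()})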
diff --git a/data/g4pC_taskc.txt b/data/g4pC_taskc.txt new file mode 100755 index 0000000..32d62c4 --- /dev/null +++ b/data/g4pC_taskc.txt @@ -0,0 +1,6 @@ +The vector space model are the documents which are represented as “bags of words”.The basic idea is to represent each document as a vector of certain weighted word frequencies. In order to do so, the following parsing and extraction steps are needed. +1. Ignoring case, extract all unique words from the entire set of documents. +2. Eliminate non-content-bearing ``stopwords'' such as ``a'', ``and'', ``the'', etc. For sample lists of stopwords. +3. For each document, count the number of occurrences of each word. +4. Using heuristic or information-theoretic criteria, eliminate non-content-bearing ``high-frequency'' and ``low-frequency'' words. +5. After the above elimination, suppose unique words remain. Assign a unique identifier between and to each remaining word, and a unique identifier between and to each document. diff --git a/data/g4pC_taskd.txt b/data/g4pC_taskd.txt new file mode 100755 index 0000000..ca9066a --- /dev/null +++ b/data/g4pC_taskd.txt @@ -0,0 +1,9 @@ +In probability theory, Bayes' theorem relates the conditional and marginal probabilities of two random events. It is usually be used to compute posterior probabilities given observations. For instance, a patient may be observed to have certain symptoms. Bayes' theorem can be used to compute the probability that a proposed diagnosis is correct, given that observation. +As a formal theorem, Bayes' theorem is valid in all common interpretations of probability. However, it plays a central role in the debate around the foundations of statistics: frequentist and Bayesian interpretations disagree about the ways in which probabilities should be assigned in applications. The articles on Bayesian probability and frequentist probability discuss these debates in greater detail. Frequentists assign probabilities to random events according to their frequencies of occurrence or to subsets of populations as proportions of the whole. At the same time, Bayesians describe probabilities in terms of beliefs and degrees of uncertainty. +Bayes' theorem relates the conditional and marginal probabilities of events A and B, where B has a non-vanishing probability: +Each term in Bayes' theorem has a conventional name: +• P(A) is the prior probability or marginal probability of A. It is "prior" in the sense that it does not take into account any information about B. +• P(A|B) is the conditional probability of A, given B. It is also called the posterior probability because it is derived from or depends upon the specified value of B. +• P(B|A) is the conditional probability of B given A. +• P(B) is the prior or marginal probability of B, and acts as a normalizing constant. +Intuitively, Bayes' theorem in this form describes the way in which one's beliefs about observing 'A' are updated by having observed 'B'. diff --git a/data/g4pC_taske.txt b/data/g4pC_taske.txt new file mode 100755 index 0000000..3d34f97 --- /dev/null +++ b/data/g4pC_taske.txt @@ -0,0 +1,4 @@ +In mathematics and computer science, dynamic programming is a methodology of the solution of the problems that exhibit the properties of overlapping subproblems and optimal substructure (described below). The methodology takes much less time rather than naive methods. +The term was originally used during the 1940s by Richard Bellman to describe the process of solving problems where one needs to find the best decisions one after another. 
By 1953, he had refined this to the modern meaning. The field was founded as a systems analysis and engineering topic that is recognized by the IEEE. Bellman's contribution is remembered in the name of the Bellman equation, a central result of dynamic programmer, which restates an optimization problem in recursive form. +The word "programming" in "dynamic programming" has no particular connection to computer programming in general , and instead of this it comes from the term "mathematical programming", a synonym for optimization. Therefore, the "program" is the optimal plan for action that is produced. For example, a finalized schedule of events at an exhibition is sometimes called a program. +Optimal substructure means that optimal solutions of subproblems can be used to find the optimal solutions of the overall problem. For instance, the shortest path to a goal from a vertex in a graph can be found by first computing the shortest path to the goal from all adjacent vertices. After this, it is using this to pick the best overall path. In a word, we can solve a problem with optimal substructure using a three-step process. diff --git a/data/g4pD_taska.txt b/data/g4pD_taska.txt new file mode 100755 index 0000000..cedb441 --- /dev/null +++ b/data/g4pD_taska.txt @@ -0,0 +1,5 @@ +The idea of inheritance in OOP refers to the formation of new classes with the already existing classes. The concept of inheritance was basically formulated for Simula in 1967. +As a result, the newly created inherited or derived classes inherit the properties and behavior of the classes from which they are derived. These original classes are either called base classes or sometimes referred to as ancestor classes. +The idea of inheritance is to reuse the existing code with little or no modification at all. +The basic support provided by inheritance is that it represents by categorization in computer languages. The power mechanism number of information processing that is crucial to human learning by the means of generalization and cognitive economy is called categorization. Where generalization if the knowledge of specific entities and is applied to a wider group provided that belongs relation can be created. On the other hand cognitive economy is where less information needs to be stored about each specific entity except for some particularities. +There are examples where we can have modules with similar interfaces. The advantage that inheritance provides is that it makes such modules share a lot of code which consequently reduces the complexity of the program. diff --git a/data/g4pD_taskb.txt b/data/g4pD_taskb.txt new file mode 100755 index 0000000..7b7af9d --- /dev/null +++ b/data/g4pD_taskb.txt @@ -0,0 +1,6 @@ +PageRank is a probability distribution used to represent the likelihood that a person randomly clicking on links will arrive at any particular page. . It is assumed in several research papers that the distribution is evenly divided between all documents in the collection at the beginning of the computational process. PageRank can be calculated for collections of documents of any size The PageRank computations require several passes, called "iterations", through the collection to adjust approximate PageRank values to more closely reflect the theoretical true value. +A probability is expressed as a numeric value between 0 and 1. A 0. 5 probability is commonly expressed as a "50% chance" of something happening. 
Hence, a PageRank of 0.5 means there is a 50% chance that a person clicking on a random link will be directed to the document with the 0.5 PageRank. +Simplified algorithm +How PageRank Works +Assume a small universe of four web pages: A, B, C and D. The initial approximation of PageRank would be evenly divided between these four documents. Hence, each document would begin with an estimated PageRank of 0.25. +In the original form of PageRank initial values were simply 1. This meant that the sum of all pages was the total number of pages on the web. Later versions of PageRank (see the below formulas) would assume a probability distribution between 0 and 1. Here we're going to simply use a probability distribution hence the initial value of 0.25. diff --git a/data/g4pD_taskc.txt b/data/g4pD_taskc.txt new file mode 100755 index 0000000..84dd7f0 --- /dev/null +++ b/data/g4pD_taskc.txt @@ -0,0 +1,10 @@ +In vector space model, the documents from which the information is to be retrieved are represented as vectors. The term weighting indentifies the success or failure of the vector space method. Terms are basically the words or any indexing unit used to identify the contents of a text. Furthermore, a term weighting scheme plays an important role for the similarity measure. The similarity measures largely identify the retrieval efficiency of a particular information retrieval system. + This largely depends on formulas. Where the formulas depend only on the frequencies within the document and they not depend on inter-document frequencies. The main components of the formulas are as follows: +Binary: +Binary formula gives every word that appears in a document equal relevance. This can be useful when the number of times a word appears is not considered important. +Term frequency: +This formula counts how many times the term occurs in a document. The more times a term t occurs in document d the more likely it is that t is relevant to the document. Used alone, favors common words and long documents. This formula gives more credit to words that appears more frequently, but often too much credit. +Augmented normalized term frequency +This formula tries to give credit to any word that appears and then give some additional credit to words that appear frequently. +Logarithmic term frequency +Logarithms are a way to de-emphasize the e_ect of frequency. Literature proposes log and alternate log as the most used diff --git a/data/g4pD_taskd.txt b/data/g4pD_taskd.txt new file mode 100755 index 0000000..66d9f18 --- /dev/null +++ b/data/g4pD_taskd.txt @@ -0,0 +1 @@ +Bayes' Theorem is a simple mathematical formula used for calculating conditional probabilities. Bayes' Theorem is a theorem of probability theory originally stated by the Reverend Thomas Bayes. It figures prominently in subjectivist or Bayesian approaches to epistemology, statistics, and inductive logic. It can be seen as a way of understanding how the probability that a theory is true is affected by a new piece of evidence. It has been used in a wide variety of contexts, ranging from marine biology to the development of "Bayesian" spam blockers for email systems. In the philosophy of science, it has been used to try to clarify the relationship between theory and evidence. Many insights in the philosophy of science involving confirmation, falsification, the relation between science and pseudosience, and other topics can be made more precise, and sometimes extended or corrected, by using Bayes' Theorem. 
Subjectivists, who maintain that rational belief is governed by the laws of probability, lean heavily on conditional probabilities in their theories of evidence and their models of empirical learning. Bayes' Theorem is central to these enterprises both because it simplifies the calculation of conditional probabilities and because it clarifies significant features of subjectivist position. Indeed, the Theorem's central insight that a hypothesis is confirmed by any body of data that its truth renders probable is the cornerstone of all subjectivist methodology. \ No newline at end of file diff --git a/data/g4pD_taske.txt b/data/g4pD_taske.txt new file mode 100755 index 0000000..4c49a74 --- /dev/null +++ b/data/g4pD_taske.txt @@ -0,0 +1,4 @@ +Dynamic programming is a method of providing solutions to potential problems exhibiting the properties of overlapping sub problems and optimal structure. This is highly used in dynamic programming. The advantage being the less time consumption in comparison to other amateur methods. +It has to be kept in mind that the term programming in the field has got nothing to do with computer programming at all. On the other hand it is derived from the term mathematical programming which is a similar word used for optimization. Here by meaning that a program can be an optimal plan for the produced action. The typical example could be of a finalized schedule of events at an exhibition. This leads to the concept of programming being a helper in finding an acceptable plan of action, which can also be termed as an algorithm +The subproblems are, themselves, solved by dividing them into sub-subproblems, and so on, until we reach some simple case that is solvable in constant time. +Overlapping subproblems means that the same subproblems are used to solve many different larger problems. Example could be of Fibonacci sequence; F3 = F1 + F2 and F4 = F2 + F3 computing each number involves computing F2. Because both F3 and F4 are needed to compute F5, a naive approach to computing F5 may end up computing F2 twice or more. It means that whenever we encounter with overlapping subproblems, a naive approach may waste to,e recomputing optimal solutions to the already solved subproblems. diff --git a/data/g4pE_taska.txt b/data/g4pE_taska.txt new file mode 100755 index 0000000..65bdc09 --- /dev/null +++ b/data/g4pE_taska.txt @@ -0,0 +1,4 @@ +Object oriented programming is a style of programming that supports encapsulation, inheritance, and polymorphism. Inheritance means derived a new class from the base class. We can also say there are parents class and child classes in inheritance. Inheritance was firstly derived in 1967. +The child class has all the features of parents class or we can say the base class more over it may also include some additional features. Inheritance is used for modification and implementation new features in computer programming language.It is possible that child class has all the attributes of parents class but it is not possible that all the attributes of child class must have in base class or parent class. +I categorization in computer language also inheritance is a useful tool.categorization define as a powerful feature.it has been also used in generalisation and in human learning. In some areas less information need to be stored. +Generlisation also some time known as inheritance. The main reason behind this is a hierarchi structure of objects and classes. 
We can understand this mechanism by some examples: like fruit is aq main class and mangoes apple ,orange is child classs of the main class.So obviously inherit all the properties of fruit class. diff --git a/data/g4pE_taskb.txt b/data/g4pE_taskb.txt new file mode 100755 index 0000000..a19466e --- /dev/null +++ b/data/g4pE_taskb.txt @@ -0,0 +1,6 @@ +PageRankalgorithm is also known as link analysis algorithm. It has been used by google. The algorithm may be applied to any collection of entities with reciprocal quotations and hyperlinked set of documents, such as the World Wide Web, with the purpose of "measuring references. The name "PageRank" is a trademark of Google, and the PageRank process has been patented (U.S. Patent 6,285,999 ). The numerical weight that it assigns to any given element E is also called the PageRank of E and denoted by PR(E). +The name "PageRank" is a trademark of Google, and the PageRank process has been patented (U.S. Patent 6,285,999 ). However, the patent is assigned to Stanford University and not to Google. Google has exclusive license rights on the patent from Stanford University. +In other words, a PageRank results from a "ballot" among all the other pages on the World Wide Web about how important a page is. A hyperlink to a page counts as a vote of support. The PageRank of a page is defined recursively and depends on the number and PageRank metric of all pages that link to it ("incoming links"). +Numerous academic papers concerning PageRank have been published since Page and Brin's original paper.[4] In practice, the PageRank concept has proven to be vulnerable to manipulation, and extensive research has been devoted to identifying falsely inflated PageRank and ways to ignore links from documents with falsely inflated PageRank + + diff --git a/data/g4pE_taskc.txt b/data/g4pE_taskc.txt new file mode 100755 index 0000000..5e746f4 --- /dev/null +++ b/data/g4pE_taskc.txt @@ -0,0 +1,9 @@ +The definition of term depends on the application. Typically terms are single words, keywords, or longer phrases. If the words are chosen to be the terms, the dimensionality of the vector is the number of words in the vocabulary A document is represented as a vector. Each dimensions corresponds to a separate terms. If a term occurs in the document, its value in the vector is non-zero. +Relevancy rankings of documents in a keyword search can be calculated, using the assumptions of document similarities theory, by comparing the deviation of angles between each document vector and the original query vector where the query is represented as same kind of vector as the documents. +LIMITATION: +There is some limitation of vector space model. +Models based on and extending the vector space model include: + Generalized vector space model. + (enhanced) Topic-based Vector Space Model [1] (eTVSM) Extends the vector space model by removing the constraint that the term-vectors be orthogonal. In contrast to the generalized vector space model the (enhanced) Topic-based Vector Space Model does not depend on concurrence-based similarities between terms. The enhancement of the enhanced Topic-based Vector Space Model (compared to the not enhanced one) is a proposal on how to derive term-vectors from an Ontology. 
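[Editorial illustration, not part of the corpus or the project code] The term-weighting and similarity ideas summarized in the task-c answers above can be sketched in a few lines of Python: documents and a query are mapped to term-frequency vectors, and relevancy is ranked by the cosine of the angle between each document vector and the query vector. All names and sample texts below are invented for the sketch.

from collections import Counter
import math

def tf_vector(text):
    # Raw term-frequency weighting: each word's weight is how often it occurs.
    return Counter(text.lower().split())

def cosine_similarity(a, b):
    # Cosine of the angle between two sparse term vectors.
    dot = sum(a[t] * b[t] for t in set(a) & set(b))
    norm_a = math.sqrt(sum(v * v for v in a.values()))
    norm_b = math.sqrt(sum(v * v for v in b.values()))
    return dot / (norm_a * norm_b) if norm_a and norm_b else 0.0

docs = ["the vector space model represents documents as term vectors",
        "bayes theorem relates conditional and marginal probabilities"]
query = tf_vector("vector space model")

# Rank documents by how small the angle between each document and the query is.
for doc in sorted(docs, key=lambda d: cosine_similarity(tf_vector(d), query), reverse=True):
    print(round(cosine_similarity(tf_vector(doc), query), 3), doc)

Binary weighting would replace the counts with 0/1, and tf-idf would further scale each count by an inverse document frequency; the ranking step stays the same.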
+ + diff --git a/data/g4pE_taskd.txt b/data/g4pE_taskd.txt new file mode 100755 index 0000000..2c7cab4 --- /dev/null +++ b/data/g4pE_taskd.txt @@ -0,0 +1,23 @@ +"Bayes' Theorem" or "Bayes' Rule", or something called Bayesian reasoning +The Bayesian Conspiracy is a multinational, interdisciplinary, and shadowy group of scientists that controls publication, grants, tenure, and the illicit traffic in grad students. The best way to be accepted into the Bayesian Conspiracy is to join the Campus Crusade for Bayes in high school or college, and gradually work your way up to the inner circles. . +Bayes' Theorem + + +Let and be sets. Conditional probability requires that + (1) +where denotes intersection ("and"), and also that + (2) +Therefore, + (3) +Now, let + (4) +so is an event in and for , then + (5) + (6) +But this can be written + (7) +so + + + +This paper proposes a new measure called scaled inverse document frequency (SIDF) which evaluates the conditional specificity of query terms over a subset S of D and without making any assumption about term independence. S can be estimated from search results, OR searches, or computed from inverted index data. We have evaluated SIDF values from commercial search engines by submitting queries relevant to the financial investment domain. Results compare favorably across search engines and queries. Our approach has practical applications for `real-world scenarios like in Web Mining, Homeland Security, and keyword-driven marketing research scenarios. diff --git a/data/g4pE_taske.txt b/data/g4pE_taske.txt new file mode 100755 index 0000000..c0222ca --- /dev/null +++ b/data/g4pE_taske.txt @@ -0,0 +1,9 @@ + Dynamic programming is a method for efficiently solving a broad range of search and optimization problems which exhibit the characteristics of overlappling. Dynamic programming. Design technique, like divide-and-conquer method. + +The leading and most up-to-date textbook on the far-ranging algorithmic methododogy of Dynamic Programming, which can be used for optimal control, ... +The word Programming in the name has nothing to do with writing computer programs. Mathematicians use the word to describe a set of rules which anyone can follow to solve a problem. They do not have to be written in a computer language. +Dynamic programming was the brainchild of an American Mathematician, Richard Bellman, who described the way of solving problems where you need to find the best decisions one after another. In the forty-odd years since this development, the number of uses and applications of dynamic programming has increased enormously. +For example, in 1982 David Kohler used dynamic programming to analyse the best way to play the game of darts. + +1. In recent years, dynamic programming languages develope very fastly, especially PHP and Ruby. There is no doubt that They have already became the first choice for many programmerers when developing web applications..When you learn a new natural language and you start to use it you naturally, you find yourself using new concepts and paradigms that enrich the use of the language you already know; expect the same result with computer languages. + diff --git a/data/orig_taska.txt b/data/orig_taska.txt new file mode 100755 index 0000000..27dd14e --- /dev/null +++ b/data/orig_taska.txt @@ -0,0 +1,12 @@ +In object-oriented programming, inheritance is a way to form new classes (instances of which are called objects) using classes that have already been defined. The inheritance concept was invented in 1967 for Simula. 
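[Editorial illustration, not part of the corpus] The mechanism these task-a texts describe, a derived class taking over the attributes and behaviour of a base (ancestor) class and optionally overriding them, looks like this in Python; the fruit/apple classes are simply the running example used in the source text.

class Fruit:
    """Base (ancestor) class: behaviour shared by all fruit."""
    def __init__(self, name):
        self.name = name

    def describe(self):
        return self.name + " is a fleshy container for the seed of a plant"

class Apple(Fruit):
    """Derived class: reuses the base-class code with no modification."""
    def __init__(self):
        super().__init__("apple")

class Mango(Fruit):
    """Derived class that overrides (replaces) a method exposed by its ancestor."""
    def __init__(self):
        super().__init__("mango")

    def describe(self):
        return "a mango, and like any fruit: " + super().describe()

print(Apple().describe())   # inherited behaviour, unchanged
print(Mango().describe())   # overriding, one typical way inheritance is used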
+ +The new classes, known as derived classes, take over (or inherit) attributes and behavior of the pre-existing classes, which are referred to as base classes (or ancestor classes). It is intended to help reuse existing code with little or no modification. + +Inheritance provides the support for representation by categorization in computer languages. Categorization is a powerful mechanism number of information processing, crucial to human learning by means of generalization (what is known about specific entities is applied to a wider group given a belongs relation can be established) and cognitive economy (less information needs to be stored about each specific entity, only its particularities). + +Inheritance is also sometimes called generalization, because the is-a relationships represent a hierarchy between classes of objects. For instance, a "fruit" is a generalization of "apple", "orange", "mango" and many others. One can consider fruit to be an abstraction of apple, orange, etc. Conversely, since apples are fruit (i.e., an apple is-a fruit), apples may naturally inherit all the properties common to all fruit, such as being a fleshy container for the seed of a plant. + +An advantage of inheritance is that modules with sufficiently similar interfaces can share a lot of code, reducing the complexity of the program. Inheritance therefore has another view, a dual, called polymorphism, which describes many pieces of code being controlled by shared control code. +Inheritance is typically accomplished either by overriding (replacing) one or more methods exposed by ancestor, or by adding new methods to those exposed by an ancestor. + +Complex inheritance, or inheritance used within a design that is not sufficiently mature, may lead to the Yo-yo problem. \ No newline at end of file diff --git a/data/orig_taskb.txt b/data/orig_taskb.txt new file mode 100755 index 0000000..e7d76f3 --- /dev/null +++ b/data/orig_taskb.txt @@ -0,0 +1,8 @@ +PageRank is a link analysis algorithm used by the Google Internet search engine that assigns a numerical weighting to each element of a hyperlinked set of documents, such as the World Wide Web, with the purpose of "measuring" its relative importance within the set. The algorithm may be applied to any collection of entities with reciprocal quotations and references. The numerical weight that it assigns to any given element E is also called the PageRank of E and denoted by PR(E). +The name "PageRank" is a trademark of Google, and the PageRank process has been patented (U.S. Patent 6,285,999 ). However, the patent is assigned to Stanford University and not to Google. Google has exclusive license rights on the patent from Stanford University. The university received 1.8 million shares in Google in exchange for use of the patent; the shares were sold in 2005 for $336 million. +Google describes PageRank: +“ PageRank relies on the uniquely democratic nature of the web by using its vast link structure as an indicator of an individual page's value. In essence, Google interprets a link from page A to page B as a vote, by page A, for page B. But, Google looks at more than the sheer volume of votes, or links a page receives; it also analyzes the page that casts the vote. Votes cast by pages that are themselves "important" weigh more heavily and help to make other pages "important". ” +In other words, a PageRank results from a "ballot" among all the other pages on the World Wide Web about how important a page is. A hyperlink to a page counts as a vote of support. 
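[Editorial illustration, not part of the corpus] The simplified, iterative PageRank computation that the task-b answers walk through, four pages starting at 0.25 and repeatedly passing their rank along outgoing links as "votes", can be sketched as follows; the link graph is invented for the example.

# Hypothetical link graph: page -> pages it links to.
links = {
    'A': ['B', 'C'],
    'B': ['C'],
    'C': ['A'],
    'D': ['C'],
}
ranks = {page: 0.25 for page in links}          # initial approximation, evenly divided

for _ in range(20):                             # "iterations" over the collection
    new_ranks = {page: 0.0 for page in links}
    for page, outgoing in links.items():
        share = ranks[page] / len(outgoing)     # each outgoing link casts an equal vote
        for target in outgoing:
            new_ranks[target] += share
    ranks = new_ranks

# Every page here links somewhere, so no dangling-node handling is needed and the
# values keep summing to 1.0, i.e. they remain a probability distribution.
print(ranks)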
The PageRank of a page is defined recursively and depends on the number and PageRank metric of all pages that link to it ("incoming links"). A page that is linked to by many pages with high PageRank receives a high rank itself. If there are no links to a web page there is no support for that page. +Google assigns a numeric weighting from 0-10 for each webpage on the Internet; this PageRank denotes a site’s importance in the eyes of Google. The PageRank is derived from a theoretical probability value on a logarithmic scale like the Richter Scale. The PageRank of a particular page is roughly based upon the quantity of inbound links as well as the PageRank of the pages providing the links. It is known that other factors, e.g. relevance of search words on the page and actual visits to the page reported by the Google toolbar also influence the PageRank. In order to prevent manipulation, spoofing and Spamdexing, Google provides no specific details about how other factors influence PageRank. +Numerous academic papers concerning PageRank have been published since Page and Brin's original paper. In practice, the PageRank concept has proven to be vulnerable to manipulation, and extensive research has been devoted to identifying falsely inflated PageRank and ways to ignore links from documents with falsely inflated PageRank. +Other link-based ranking algorithms for Web pages include the HITS algorithm invented by Jon Kleinberg (used by Teoma and now Ask.com), the IBM CLEVER project, and the TrustRank algorithm. diff --git a/data/orig_taskc.txt b/data/orig_taskc.txt new file mode 100755 index 0000000..59f1e31 --- /dev/null +++ b/data/orig_taskc.txt @@ -0,0 +1,8 @@ +Vector space model (or term vector model) is an algebraic model for representing text documents (and any objects, in general) as vectors of identifiers, such as, for example, index terms. It is used in information filtering, information retrieval, indexing and relevancy rankings. Its first use was in the SMART Information Retrieval System. +A document is represented as a vector. Each dimension corresponds to a separate term. If a term occurs in the document, its value in the vector is non-zero. Several different ways of computing these values, also known as (term) weights, have been developed. One of the best known schemes is tf-idf weighting (see the example below). +The definition of term depends on the application. Typically terms are single words, keywords, or longer phrases. If the words are chosen to be the terms, the dimensionality of the vector is the number of words in the vocabulary (the number of distinct words occurring in the corpus). +The vector space model has the following limitations: + 1. Long documents are poorly represented because they have poor similarity values (a small scalar product and a large dimensionality) + 2. Search keywords must precisely match document terms; word substrings might result in a "false positive match" + 3. Semantic sensitivity; documents with similar context but different term vocabulary won't be associated, resulting in a "false negative match". + 4. The order in which the terms appear in the document is lost in the vector space representation. diff --git a/data/orig_taskd.txt b/data/orig_taskd.txt new file mode 100755 index 0000000..0788cc8 --- /dev/null +++ b/data/orig_taskd.txt @@ -0,0 +1,10 @@ +In probability theory, Bayes' theorem (often called Bayes' law after Rev Thomas Bayes) relates the conditional and marginal probabilities of two random events. 
It is often used to compute posterior probabilities given observations. For example, a patient may be observed to have certain symptoms. Bayes' theorem can be used to compute the probability that a proposed diagnosis is correct, given that observation. (See example 2) +As a formal theorem, Bayes' theorem is valid in all common interpretations of probability. However, it plays a central role in the debate around the foundations of statistics: frequentist and Bayesian interpretations disagree about the ways in which probabilities should be assigned in applications. Frequentists assign probabilities to random events according to their frequencies of occurrence or to subsets of populations as proportions of the whole, while Bayesians describe probabilities in terms of beliefs and degrees of uncertainty. The articles on Bayesian probability and frequentist probability discuss these debates in greater detail. +Bayes' theorem relates the conditional and marginal probabilities of events A and B, where B has a non-vanishing probability: + P(A|B) = \frac{P(B | A)\, P(A)}{P(B)}. +Each term in Bayes' theorem has a conventional name: + * P(A) is the prior probability or marginal probability of A. It is "prior" in the sense that it does not take into account any information about B. + * P(A|B) is the conditional probability of A, given B. It is also called the posterior probability because it is derived from or depends upon the specified value of B. + * P(B|A) is the conditional probability of B given A. + * P(B) is the prior or marginal probability of B, and acts as a normalizing constant. +Intuitively, Bayes' theorem in this form describes the way in which one's beliefs about observing 'A' are updated by having observed 'B'. diff --git a/data/orig_taske.txt b/data/orig_taske.txt new file mode 100755 index 0000000..a1be673 --- /dev/null +++ b/data/orig_taske.txt @@ -0,0 +1,11 @@ +In mathematics and computer science, dynamic programming is a method of solving problems that exhibit the properties of overlapping subproblems and optimal substructure (described below). The method takes much less time than naive methods. +The term was originally used in the 1940s by Richard Bellman to describe the process of solving problems where one needs to find the best decisions one after another. By 1953, he had refined this to the modern meaning. The field was founded as a systems analysis and engineering topic that is recognized by the IEEE. Bellman's contribution is remembered in the name of the Bellman equation, a central result of dynamic programming which restates an optimization problem in recursive form. +The word "programming" in "dynamic programming" has no particular connection to computer programming at all, and instead comes from the term "mathematical programming", a synonym for optimization. Thus, the "program" is the optimal plan for action that is produced. For instance, a finalized schedule of events at an exhibition is sometimes called a program. Programming, in this sense, means finding an acceptable plan of action, an algorithm. +Optimal substructure means that optimal solutions of subproblems can be used to find the optimal solutions of the overall problem. For example, the shortest path to a goal from a vertex in a graph can be found by first computing the shortest path to the goal from all adjacent vertices, and then using this to pick the best overall path, as shown in Figure 1. In general, we can solve a problem with optimal substructure using a three-step process: + 1. 
Break the problem into smaller subproblems. + 2. Solve these problems optimally using this three-step process recursively. + 3. Use these optimal solutions to construct an optimal solution for the original problem. +The subproblems are, themselves, solved by dividing them into sub-subproblems, and so on, until we reach some simple case that is solvable in constant time. +Figure 2. The subproblem graph for the Fibonacci sequence. That it is not a tree but a DAG indicates overlapping subproblems. +To say that a problem has overlapping subproblems is to say that the same subproblems are used to solve many different larger problems. For example, in the Fibonacci sequence, F3 = F1 + F2 and F4 = F2 + F3 — computing each number involves computing F2. Because both F3 and F4 are needed to compute F5, a naive approach to computing F5 may end up computing F2 twice or more. This applies whenever overlapping subproblems are present: a naive approach may waste time recomputing optimal solutions to subproblems it has already solved. +In order to avoid this, we instead save the solutions to problems we have already solved. Then, if we need to solve the same problem later, we can retrieve and reuse our already-computed solution. This approach is called memoization (not memorization, although this term also fits). If we are sure we won't need a particular solution anymore, we can throw it away to save space. In some cases, we can even compute the solutions to subproblems we know that we'll need in advance. diff --git a/data/test_info.csv b/data/test_info.csv new file mode 100644 index 0000000..b654fe2 --- /dev/null +++ b/data/test_info.csv @@ -0,0 +1,43 @@ +File,Task,Category +g0pB_taske.txt,e,heavy +g0pC_taska.txt,a,heavy +g0pC_taskb.txt,b,non +g0pC_taskc.txt,c,non +g0pC_taskd.txt,d,cut +g0pC_taske.txt,e,light +g1pB_taskb.txt,b,non +g1pB_taskc.txt,c,heavy +g1pB_taskd.txt,d,light +g1pB_taske.txt,e,cut +g1pD_taska.txt,a,light +g1pD_taskb.txt,b,cut +g3pB_taskd.txt,d,light +g3pB_taske.txt,e,cut +g3pC_taska.txt,a,cut +g3pC_taskb.txt,b,non +g3pC_taskc.txt,c,non +g3pC_taskd.txt,d,heavy +g3pC_taske.txt,e,light +g4pB_taska.txt,a,non +g4pB_taskb.txt,b,non +g4pB_taskc.txt,c,heavy +g4pB_taskd.txt,d,light +g4pB_taske.txt,e,cut +g4pC_taska.txt,a,cut +g4pC_taskb.txt,b,non +g4pC_taskc.txt,c,non +g4pC_taskd.txt,d,heavy +g4pC_taske.txt,e,light +g4pD_taska.txt,a,light +g4pD_taskb.txt,b,cut +g4pD_taskc.txt,c,non +g4pD_taskd.txt,d,non +g4pD_taske.txt,e,heavy +g4pE_taska.txt,a,heavy +g4pE_taskb.txt,b,light +g4pE_taske.txt,e,non +orig_taska.txt,a,orig +orig_taskb.txt,b,orig +orig_taskc.txt,c,orig +orig_taskd.txt,d,orig +orig_taske.txt,e,orig \ No newline at end of file diff --git a/helpers.py b/helpers.py new file mode 100644 index 0000000..c323f13 --- /dev/null +++ b/helpers.py @@ -0,0 +1,112 @@ +import re +import pandas as pd +import operator + +# Add 'datatype' column that indicates if the record is original wiki answer as 0, training data 1, test data 2, onto +# the dataframe - uses stratified random sampling (with seed) to sample by task & plagiarism amount + +# Use function to label datatype for training 1 or test 2 +def create_datatype(df, train_value, test_value, datatype_var, compare_dfcolumn, operator_of_compare, value_of_compare, + sampling_number, sampling_seed): + # Subsets dataframe by condition relating to statement built from: + # 'compare_dfcolumn' 'operator_of_compare' 'value_of_compare' + df_subset = df[operator_of_compare(df[compare_dfcolumn], value_of_compare)] + df_subset = df_subset.drop(columns = 
[datatype_var]) + + # Prints counts by task and compare_dfcolumn for subset df + #print("\nCounts by Task & " + compare_dfcolumn + ":\n", df_subset.groupby(['Task', compare_dfcolumn]).size().reset_index(name="Counts") ) + + # Sets all datatype to value for training for df_subset + df_subset.loc[:, datatype_var] = train_value + + # Performs stratified random sample of subset dataframe to create new df with subset values + df_sampled = df_subset.groupby(['Task', compare_dfcolumn], group_keys=False).apply(lambda x: x.sample(min(len(x), sampling_number), random_state = sampling_seed)) + df_sampled = df_sampled.drop(columns = [datatype_var]) + # Sets all datatype to value for test_value for df_sampled + df_sampled.loc[:, datatype_var] = test_value + + # Prints counts by compare_dfcolumn for selected sample + #print("\nCounts by "+ compare_dfcolumn + ":\n", df_sampled.groupby([compare_dfcolumn]).size().reset_index(name="Counts") ) + #print("\nSampled DF:\n",df_sampled) + + # Labels all datatype_var column as train_value which will be overwritten to + # test_value in next for loop for all test cases chosen with stratified sample + for index in df_sampled.index: + # Labels all datatype_var columns with test_value for straified test sample + df_subset.loc[index, datatype_var] = test_value + + #print("\nSubset DF:\n",df_subset) + # Adds test_value and train_value for all relevant data in main dataframe + for index in df_subset.index: + # Labels all datatype_var columns in df with train_value/test_value based upon + # stratified test sample and subset of df + df.loc[index, datatype_var] = df_subset.loc[index, datatype_var] + + # returns nothing because dataframe df already altered + +def train_test_dataframe(clean_df, random_seed=100): + + new_df = clean_df.copy() + + # Initialize datatype as 0 initially for all records - after function 0 will remain only for original wiki answers + new_df.loc[:,'Datatype'] = 0 + + # Creates test & training datatypes for plagiarized answers (1,2,3) + create_datatype(new_df, 1, 2, 'Datatype', 'Category', operator.gt, 0, 1, random_seed) + + # Creates test & training datatypes for NON-plagiarized answers (0) + create_datatype(new_df, 1, 2, 'Datatype', 'Category', operator.eq, 0, 2, random_seed) + + # creating a dictionary of categorical:numerical mappings for plagiarsm categories + mapping = {0:'orig', 1:'train', 2:'test'} + + # traversing through dataframe and replacing categorical data + new_df.Datatype = [mapping[item] for item in new_df.Datatype] + + return new_df + + +# helper function for pre-processing text given a file +def process_file(file): + # put text in all lower case letters + all_text = file.read().lower() + + # remove all non-alphanumeric chars + all_text = re.sub(r"[^a-zA-Z0-9]", " ", all_text) + # remove newlines/tabs, etc. so it's easier to match phrases, later + all_text = re.sub(r"\t", " ", all_text) + all_text = re.sub(r"\n", " ", all_text) + all_text = re.sub(" ", " ", all_text) + all_text = re.sub(" ", " ", all_text) + + return all_text + + +def create_text_column(df, file_directory='data/'): + '''Reads in the files, listed in a df and returns that df with an additional column, `Text`. 
+ :param df: A dataframe of file information including a column for `File` + :param file_directory: the main directory where files are stored + :return: A dataframe with processed text ''' + + # create copy to modify + text_df = df.copy() + + # store processed text + text = [] + + # for each file (row) in the df, read in the file + for row_i in df.index: + filename = df.iloc[row_i]['File'] + #print(filename) + file_path = file_directory + filename + with open(file_path, 'r', encoding='utf-8', errors='ignore') as file: + + # standardize text using helper function + file_text = process_file(file) + # append processed text to list + text.append(file_text) + + # add column to the copied dataframe + text_df['Text'] = text + + return text_df diff --git a/notebook_ims/common_subseq_words.png b/notebook_ims/common_subseq_words.png new file mode 100644 index 0000000..75b4e80 Binary files /dev/null and b/notebook_ims/common_subseq_words.png differ diff --git a/notebook_ims/matrix_1.png b/notebook_ims/matrix_1.png new file mode 100644 index 0000000..34ef6da Binary files /dev/null and b/notebook_ims/matrix_1.png differ diff --git a/notebook_ims/matrix_2.png b/notebook_ims/matrix_2.png new file mode 100644 index 0000000..d0861b4 Binary files /dev/null and b/notebook_ims/matrix_2.png differ diff --git a/notebook_ims/matrix_3_match.png b/notebook_ims/matrix_3_match.png new file mode 100644 index 0000000..0a7feb9 Binary files /dev/null and b/notebook_ims/matrix_3_match.png differ diff --git a/notebook_ims/matrix_6_complete.png b/notebook_ims/matrix_6_complete.png new file mode 100644 index 0000000..8fe9ac4 Binary files /dev/null and b/notebook_ims/matrix_6_complete.png differ diff --git a/notebook_ims/matrix_rules.png b/notebook_ims/matrix_rules.png new file mode 100644 index 0000000..778a589 Binary files /dev/null and b/notebook_ims/matrix_rules.png differ diff --git a/plagiarism_data/test.csv b/plagiarism_data/test.csv new file mode 100644 index 0000000..28c2f39 --- /dev/null +++ b/plagiarism_data/test.csv @@ -0,0 +1,25 @@ +1,1.0,0.9222797927461139,0.8207547169811321 +1,0.7653061224489796,0.5896551724137931,0.6217105263157895 +1,0.8844444444444445,0.18099547511312217,0.597457627118644 +1,0.6190476190476191,0.043243243243243246,0.42783505154639173 +1,0.92,0.39436619718309857,0.775 +1,0.9926739926739927,0.9739776951672863,0.9930555555555556 +0,0.4126984126984127,0.0,0.3466666666666667 +0,0.4626865671641791,0.0,0.18932038834951456 +0,0.581151832460733,0.0,0.24742268041237114 +0,0.5842105263157895,0.0,0.29441624365482233 +0,0.5663716814159292,0.0,0.25833333333333336 +0,0.48148148148148145,0.022900763358778626,0.2789115646258503 +1,0.6197916666666666,0.026595744680851064,0.3415841584158416 +1,0.9217391304347826,0.6548672566371682,0.9294117647058824 +1,1.0,0.9224806201550387,1.0 +1,0.8615384615384616,0.06282722513089005,0.5047169811320755 +1,0.6261682242990654,0.22397476340694006,0.5585585585585585 +1,1.0,0.9688715953307393,0.9966996699669967 +0,0.3838383838383838,0.010309278350515464,0.178743961352657 +1,1.0,0.9446494464944649,0.8546712802768166 +0,0.6139240506329114,0.0,0.2983425414364641 +1,0.9727626459143969,0.8300395256916996,0.9270833333333334 +1,0.9628099173553719,0.6890756302521008,0.9098039215686274 +0,0.4152542372881356,0.0,0.1774193548387097 +0,0.5321888412017167,0.017467248908296942,0.24583333333333332 diff --git a/plagiarism_data/train.csv b/plagiarism_data/train.csv new file mode 100644 index 0000000..5c7b8bb --- /dev/null +++ b/plagiarism_data/train.csv @@ -0,0 +1,70 @@ 
+0,0.39814814814814814,0.0,0.1917808219178082 +1,0.8693693693693694,0.44954128440366975,0.8464912280701754 +1,0.5935828877005348,0.08196721311475409,0.3160621761658031 +0,0.5445026178010471,0.0,0.24257425742574257 +0,0.32950191570881227,0.0,0.16117216117216118 +0,0.5903083700440529,0.0,0.30165289256198347 +1,0.7597765363128491,0.24571428571428572,0.484304932735426 +0,0.5161290322580645,0.0,0.2708333333333333 +0,0.44086021505376344,0.0,0.22395833333333334 +1,0.9794520547945206,0.7887323943661971,0.9 +1,0.9513888888888888,0.5214285714285715,0.8940397350993378 +1,0.9764705882352941,0.5783132530120482,0.8232044198895028 +1,0.8117647058823529,0.28313253012048195,0.45977011494252873 +0,0.4411764705882353,0.0,0.3055555555555556 +0,0.4888888888888889,0.0,0.2826086956521739 +1,0.813953488372093,0.6341463414634146,0.7888888888888889 +0,0.6111111111111112,0.0,0.3246753246753247 +1,1.0,0.9659090909090909,1.0 +1,0.634020618556701,0.005263157894736842,0.36893203883495146 +1,0.5829383886255924,0.08695652173913043,0.4166666666666667 +1,0.6379310344827587,0.30701754385964913,0.4898785425101215 +0,0.42038216560509556,0.0,0.21875 +1,0.6877637130801688,0.07725321888412018,0.5163934426229508 +1,0.6766467065868264,0.11042944785276074,0.4725274725274725 +1,0.7692307692307693,0.45084745762711864,0.6064516129032258 +1,0.7122641509433962,0.08653846153846154,0.536697247706422 +1,0.6299212598425197,0.28,0.39436619718309857 +1,0.7157360406091371,0.0051813471502590676,0.3431372549019608 +0,0.3320610687022901,0.0,0.15302491103202848 +1,0.7172131147540983,0.07916666666666666,0.4559386973180077 +1,0.8782608695652174,0.47345132743362833,0.82 +1,0.5298013245033113,0.31543624161073824,0.45 +0,0.5721153846153846,0.0,0.22935779816513763 +0,0.319672131147541,0.0,0.16535433070866143 +0,0.53,0.0,0.26046511627906976 +1,0.78,0.6071428571428571,0.6699029126213593 +0,0.6526946107784432,0.0,0.3551912568306011 +0,0.4439461883408072,0.0,0.23376623376623376 +1,0.6650246305418719,0.18090452261306533,0.3492647058823529 +1,0.7281553398058253,0.034653465346534656,0.3476190476190476 +1,0.7620481927710844,0.2896341463414634,0.5677233429394812 +1,0.9470198675496688,0.2857142857142857,0.774390243902439 +1,0.3684210526315789,0.0,0.19298245614035087 +0,0.5328947368421053,0.0,0.21818181818181817 +0,0.6184971098265896,0.005917159763313609,0.26666666666666666 +0,0.5103092783505154,0.010526315789473684,0.22110552763819097 +0,0.5798319327731093,0.0,0.2289156626506024 +0,0.40703517587939697,0.0,0.1722488038277512 +0,0.5154639175257731,0.0,0.23684210526315788 +1,0.5845410628019324,0.04926108374384237,0.29493087557603687 +1,0.6171875,0.1693548387096774,0.5037593984962406 +1,1.0,0.84251968503937,0.9117647058823529 +1,0.9916666666666667,0.8879310344827587,0.9923076923076923 +0,0.550561797752809,0.0,0.2833333333333333 +0,0.41935483870967744,0.0,0.2616822429906542 +1,0.8351648351648352,0.034482758620689655,0.6470588235294118 +1,0.9270833333333334,0.29347826086956524,0.85 +0,0.4928909952606635,0.0,0.2350230414746544 +1,0.7087378640776699,0.3217821782178218,0.6619718309859155 +1,0.8633879781420765,0.30726256983240224,0.7911111111111111 +1,0.9606060606060606,0.8650306748466258,0.9298245614035088 +0,0.4380165289256198,0.0,0.2230769230769231 +1,0.7336683417085427,0.07179487179487179,0.4900990099009901 +1,0.5138888888888888,0.0,0.25203252032520324 +0,0.4861111111111111,0.0,0.22767857142857142 +1,0.8451882845188284,0.3021276595744681,0.6437246963562753 +1,0.485,0.0,0.24271844660194175 +1,0.9506726457399103,0.7808219178082192,0.8395061728395061 
+1,0.551219512195122,0.23383084577114427,0.2830188679245283 +0,0.3612565445026178,0.0,0.16176470588235295 diff --git a/problem_unittests.py b/problem_unittests.py new file mode 100644 index 0000000..3036160 --- /dev/null +++ b/problem_unittests.py @@ -0,0 +1,153 @@ +from unittest.mock import MagicMock, patch +import sklearn.naive_bayes +import numpy as np +import pandas as pd +import re + +# test csv file +TEST_CSV = 'data/test_info.csv' + +class AssertTest(object): + '''Defines general test behavior.''' + def __init__(self, params): + self.assert_param_message = '\n'.join([str(k) + ': ' + str(v) + '' for k, v in params.items()]) + + def test(self, assert_condition, assert_message): + assert assert_condition, assert_message + '\n\nUnit Test Function Parameters\n' + self.assert_param_message + +def _print_success_message(): + print('Tests Passed!') + +# test clean_dataframe +def test_numerical_df(numerical_dataframe): + + # test result + transformed_df = numerical_dataframe(TEST_CSV) + + # Check type is a DataFrame + assert isinstance(transformed_df, pd.DataFrame), 'Returned type is {}.'.format(type(transformed_df)) + + # check columns + column_names = list(transformed_df) + assert 'File' in column_names, 'No File column, found.' + assert 'Task' in column_names, 'No Task column, found.' + assert 'Category' in column_names, 'No Category column, found.' + assert 'Class' in column_names, 'No Class column, found.' + + # check conversion values + assert transformed_df.loc[0, 'Category'] == 1, '`heavy` plagiarism mapping test, failed.' + assert transformed_df.loc[2, 'Category'] == 0, '`non` plagiarism mapping test, failed.' + assert transformed_df.loc[30, 'Category'] == 3, '`cut` plagiarism mapping test, failed.' + assert transformed_df.loc[5, 'Category'] == 2, '`light` plagiarism mapping test, failed.' + assert transformed_df.loc[37, 'Category'] == -1, 'original file mapping test, failed; should have a Category = -1.' + assert transformed_df.loc[41, 'Category'] == -1, 'original file mapping test, failed; should have a Category = -1.' + + _print_success_message() + + +def test_containment(complete_df, containment_fn): + + # check basic format and value + # for n = 1 and just the fifth file + test_val = containment_fn(complete_df, 1, 'g0pA_taske.txt') + + assert isinstance(test_val, float), 'Returned type is {}.'.format(type(test_val)) + assert test_val<=1.0, 'It appears that the value is not normalized; expected a value <=1, got: '+str(test_val) + + # known vals for first few files + filenames = ['g0pA_taska.txt', 'g0pA_taskb.txt', 'g0pA_taskc.txt', 'g0pA_taskd.txt'] + ngram_1 = [0.39814814814814814, 1.0, 0.86936936936936937, 0.5935828877005348] + ngram_3 = [0.0093457943925233638, 0.96410256410256412, 0.61363636363636365, 0.15675675675675677] + + # results for comparison + results_1gram = [] + results_3gram = [] + + for i in range(4): + val_1 = containment_fn(complete_df, 1, filenames[i]) + val_3 = containment_fn(complete_df, 3, filenames[i]) + results_1gram.append(val_1) + results_3gram.append(val_3) + + # check correct results + assert all(np.isclose(results_1gram, ngram_1, rtol=1e-04)), \ + 'n=1 calculations are incorrect. Double check the intersection calculation.' + # check correct results + assert all(np.isclose(results_3gram, ngram_3, rtol=1e-04)), \ + 'n=3 calculations are incorrect.' 
+ + _print_success_message() + +def test_lcs(df, lcs_word): + + test_index = 10 # file 10 + + # get answer file text + answer_text = df.loc[test_index, 'Text'] + + # get text for orig file + # find the associated task type (one character, a-e) + task = df.loc[test_index, 'Task'] + # we know that source texts have Class = -1 + orig_rows = df[(df['Class'] == -1)] + orig_row = orig_rows[(orig_rows['Task'] == task)] + source_text = orig_row['Text'].values[0] + + # calculate LCS + test_val = lcs_word(answer_text, source_text) + + # check type + assert isinstance(test_val, float), 'Returned type is {}.'.format(type(test_val)) + assert test_val<=1.0, 'It appears that the value is not normalized; expected a value <=1, got: '+str(test_val) + + # known vals for first few files + lcs_vals = [0.1917808219178082, 0.8207547169811321, 0.8464912280701754, 0.3160621761658031, 0.24257425742574257] + + # results for comparison + results = [] + + for i in range(5): + # get answer and source text + answer_text = df.loc[i, 'Text'] + task = df.loc[i, 'Task'] + # we know that source texts have Class = -1 + orig_rows = df[(df['Class'] == -1)] + orig_row = orig_rows[(orig_rows['Task'] == task)] + source_text = orig_row['Text'].values[0] + # calc lcs + val = lcs_word(answer_text, source_text) + results.append(val) + + # check correct results + assert all(np.isclose(results, lcs_vals, rtol=1e-05)), 'LCS calculations are incorrect.' + + _print_success_message() + +def test_data_split(train_x, train_y, test_x, test_y): + + # check types + assert isinstance(train_x, np.ndarray),\ + 'train_x is not an array, instead got type: {}'.format(type(train_x)) + assert isinstance(train_y, np.ndarray),\ + 'train_y is not an array, instead got type: {}'.format(type(train_y)) + assert isinstance(test_x, np.ndarray),\ + 'test_x is not an array, instead got type: {}'.format(type(test_x)) + assert isinstance(test_y, np.ndarray),\ + 'test_y is not an array, instead got type: {}'.format(type(test_y)) + + # should hold all 95 submission files + assert len(train_x) + len(test_x) == 95, \ + 'Unexpected amount of train + test data. Expecting 95 answer text files, got ' +str(len(train_x) + len(test_x)) + assert len(test_x) > 1, \ + 'Unexpected amount of test data. There should be multiple test files.' + + # check shape + assert train_x.shape[1]==2, \ + 'train_x should have as many columns as selected features, got: {}'.format(train_x.shape[1]) + assert len(train_y.shape)==1, \ + 'train_y should be a 1D array, got shape: {}'.format(train_y.shape) + + _print_success_message() + + + \ No newline at end of file diff --git a/source_pytorch/model.py b/source_pytorch/model.py new file mode 100644 index 0000000..21e1db7 --- /dev/null +++ b/source_pytorch/model.py @@ -0,0 +1,44 @@ +# torch imports +import torch.nn.functional as F +import torch.nn as nn + + +## TODO: Complete this classifier +class BinaryClassifier(nn.Module): + """ + Define a neural network that performs binary classification. + The network should accept your number of features as input, and produce + a single sigmoid value, that can be rounded to a label: 0 or 1, as output. + + Notes on training: + To train a binary classifier in PyTorch, use BCELoss. 
+ BCELoss is binary cross entropy loss, documentation: https://pytorch.org/docs/stable/nn.html#torch.nn.BCELoss + """ + + ## TODO: Define the init function, the input params are required (for loading code in train.py to work) + def __init__(self, input_features, hidden_dim, output_dim): + """ + Initialize the model by setting up linear layers. + Use the input parameters to help define the layers of your model. + :param input_features: the number of input features in your training/test data + :param hidden_dim: helps define the number of nodes in the hidden layer(s) + :param output_dim: the number of outputs you want to produce + """ + super(BinaryClassifier, self).__init__() + + # define any initial layers, here + + + + ## TODO: Define the feedforward behavior of the network + def forward(self, x): + """ + Perform a forward pass of our model on input features, x. + :param x: A batch of input features of size (batch_size, input_features) + :return: A single, sigmoid-activated value as output + """ + + # define the feedforward behavior + + return x + \ No newline at end of file diff --git a/source_pytorch/predict.py b/source_pytorch/predict.py new file mode 100644 index 0000000..faf758e --- /dev/null +++ b/source_pytorch/predict.py @@ -0,0 +1,80 @@ +# import libraries +import os +import numpy as np +import torch +from six import BytesIO + +# import model from model.py, by name +from model import BinaryClassifier + +# default content type is numpy array +NP_CONTENT_TYPE = 'application/x-npy' + + +# Provided model load function +def model_fn(model_dir): + """Load the PyTorch model from the `model_dir` directory.""" + print("Loading model.") + + # First, load the parameters used to create the model. + model_info = {} + model_info_path = os.path.join(model_dir, 'model_info.pth') + with open(model_info_path, 'rb') as f: + model_info = torch.load(f) + + print("model_info: {}".format(model_info)) + + # Determine the device and construct the model. + device = torch.device("cuda" if torch.cuda.is_available() else "cpu") + model = BinaryClassifier(model_info['input_features'], model_info['hidden_dim'], model_info['output_dim']) + + # Load the store model parameters. + model_path = os.path.join(model_dir, 'model.pth') + with open(model_path, 'rb') as f: + model.load_state_dict(torch.load(f)) + + # Prep for testing + model.to(device).eval() + + print("Done loading model.") + return model + + +# Provided input data loading +def input_fn(serialized_input_data, content_type): + print('Deserializing the input data.') + if content_type == NP_CONTENT_TYPE: + stream = BytesIO(serialized_input_data) + return np.load(stream) + raise Exception('Requested unsupported ContentType in content_type: ' + content_type) + +# Provided output data handling +def output_fn(prediction_output, accept): + print('Serializing the generated output.') + if accept == NP_CONTENT_TYPE: + stream = BytesIO() + np.save(stream, prediction_output) + return stream.getvalue(), accept + raise Exception('Requested unsupported ContentType in Accept: ' + accept) + + +# Provided predict function +def predict_fn(input_data, model): + print('Predicting class labels for the input data...') + + device = torch.device("cuda" if torch.cuda.is_available() else "cpu") + + # Process input_data so that it is ready to be sent to our model. 
+ data = torch.from_numpy(input_data.astype('float32')) + data = data.to(device) + + # Put the model into evaluation mode + model.eval() + + # Compute the result of applying the model to the input data + # The variable `out_label` should be a rounded value, either 1 or 0 + out = model(data) + out_np = out.cpu().detach().numpy() + out_label = out_np.round() + + return out_label \ No newline at end of file diff --git a/source_pytorch/train.py b/source_pytorch/train.py new file mode 100644 index 0000000..f0ff6ea --- /dev/null +++ b/source_pytorch/train.py @@ -0,0 +1,164 @@ +import argparse +import json +import os +import pandas as pd +import torch +import torch.optim as optim +import torch.utils.data + +# imports the model in model.py by name +from model import BinaryClassifier + +def model_fn(model_dir): + """Load the PyTorch model from the `model_dir` directory.""" + print("Loading model.") + + # First, load the parameters used to create the model. + model_info = {} + model_info_path = os.path.join(model_dir, 'model_info.pth') + with open(model_info_path, 'rb') as f: + model_info = torch.load(f) + + print("model_info: {}".format(model_info)) + + # Determine the device and construct the model. + device = torch.device("cuda" if torch.cuda.is_available() else "cpu") + model = BinaryClassifier(model_info['input_features'], model_info['hidden_dim'], model_info['output_dim']) + + # Load the stored model parameters. + model_path = os.path.join(model_dir, 'model.pth') + with open(model_path, 'rb') as f: + model.load_state_dict(torch.load(f)) + + # set to eval mode, could use no_grad + model.to(device).eval() + + print("Done loading model.") + return model + +# Gets training data in batches from the train.csv file +def _get_train_data_loader(batch_size, training_dir): + print("Get train data loader.") + + train_data = pd.read_csv(os.path.join(training_dir, "train.csv"), header=None, names=None) + + train_y = torch.from_numpy(train_data[[0]].values).float().squeeze() + train_x = torch.from_numpy(train_data.drop([0], axis=1).values).float() + + train_ds = torch.utils.data.TensorDataset(train_x, train_y) + + return torch.utils.data.DataLoader(train_ds, batch_size=batch_size) + + +# Provided training function +def train(model, train_loader, epochs, criterion, optimizer, device): + """ + This is the training method that is called by the PyTorch training script. The parameters + passed are as follows: + model - The PyTorch model that we wish to train. + train_loader - The PyTorch DataLoader that should be used during training. + epochs - The total number of epochs to train for. + criterion - The loss function used for training. + optimizer - The optimizer to use during training. + device - Where the model and data should be loaded (gpu or cpu). + """ + + # training loop is provided + for epoch in range(1, epochs + 1): + model.train() # Make sure that the model is in training mode. 
+ + total_loss = 0 + + for batch in train_loader: + # get data + batch_x, batch_y = batch + + batch_x = batch_x.to(device) + batch_y = batch_y.to(device) + + optimizer.zero_grad() + + # get predictions from model + y_pred = model(batch_x) + + # perform backprop + loss = criterion(y_pred, batch_y) + loss.backward() + optimizer.step() + + total_loss += loss.data.item() + + print("Epoch: {}, Loss: {}".format(epoch, total_loss / len(train_loader))) + + +## TODO: Complete the main code +if __name__ == '__main__': + + # All of the model parameters and training parameters are sent as arguments + # when this script is executed, during a training job + + # Here we set up an argument parser to easily access the parameters + parser = argparse.ArgumentParser() + + # SageMaker parameters, like the directories for training data and saving models; set automatically + # Do not need to change + parser.add_argument('--output-data-dir', type=str, default=os.environ['SM_OUTPUT_DATA_DIR']) + parser.add_argument('--model-dir', type=str, default=os.environ['SM_MODEL_DIR']) + parser.add_argument('--data-dir', type=str, default=os.environ['SM_CHANNEL_TRAIN']) + + # Training Parameters, given + parser.add_argument('--batch-size', type=int, default=10, metavar='N', + help='input batch size for training (default: 10)') + parser.add_argument('--epochs', type=int, default=10, metavar='N', + help='number of epochs to train (default: 10)') + parser.add_argument('--seed', type=int, default=1, metavar='S', + help='random seed (default: 1)') + + ## TODO: Add args for the three model parameters: input_features, hidden_dim, output_dim + # Model Parameters + + + # args holds all passed-in arguments + args = parser.parse_args() + + device = torch.device("cuda" if torch.cuda.is_available() else "cpu") + print("Using device {}.".format(device)) + + torch.manual_seed(args.seed) + + # Load the training data. + train_loader = _get_train_data_loader(args.batch_size, args.data_dir) + + + ## --- Your code here --- ## + + ## TODO: Build the model by passing in the input params + # To get params from the parser, call args.argument_name, ex. 
args.epochs or ards.hidden_dim + # Don't forget to move your model .to(device) to move to GPU , if appropriate + model = None + + ## TODO: Define an optimizer and loss function for training + optimizer = None + criterion = None + + # Trains the model (given line of code, which calls the above training function) + train(model, train_loader, args.epochs, criterion, optimizer, device) + + ## TODO: complete in the model_info by adding three argument names, the first is given + # Keep the keys of this dictionary as they are + model_info_path = os.path.join(args.model_dir, 'model_info.pth') + with open(model_info_path, 'wb') as f: + model_info = { + 'input_features': args.input_features, + 'hidden_dim': , + 'output_dim': , + } + torch.save(model_info, f) + + ## --- End of your code --- ## + + + # Save the model parameters + model_path = os.path.join(args.model_dir, 'model.pth') + with open(model_path, 'wb') as f: + torch.save(model.cpu().state_dict(), f) diff --git a/source_sklearn/train.py b/source_sklearn/train.py new file mode 100644 index 0000000..f38e268 --- /dev/null +++ b/source_sklearn/train.py @@ -0,0 +1,70 @@ +from __future__ import print_function + +import argparse +import os +import pandas as pd + +from sklearn.externals import joblib + +## TODO: Import any additional libraries you need to define a model +from sklearn.linear_model import LogisticRegression + +# Provided model load function +def model_fn(model_dir): + """Load model from the model_dir. This is the same model that is saved + in the main if statement. + """ + print("Loading model.") + + # load using joblib + model = joblib.load(os.path.join(model_dir, "model.joblib")) + print("Done loading model.") + + return model + + +## TODO: Complete the main code +if __name__ == '__main__': + + # All of the model parameters and training parameters are sent as arguments + # when this script is executed, during a training job + + # Here we set up an argument parser to easily access the parameters + parser = argparse.ArgumentParser() + + # SageMaker parameters, like the directories for training data and saving models; set automatically + # Do not need to change + parser.add_argument('--output-data-dir', type=str, default=os.environ['SM_OUTPUT_DATA_DIR']) + parser.add_argument('--model-dir', type=str, default=os.environ['SM_MODEL_DIR']) + parser.add_argument('--data-dir', type=str, default=os.environ['SM_CHANNEL_TRAIN']) + + ## TODO: Add any additional arguments that you will need to pass into your model + + # args holds all passed-in arguments + args = parser.parse_args() + + # Read in csv training file + training_dir = args.data_dir + train_data = pd.read_csv(os.path.join(training_dir, "train.csv"), header=None, names=None) + + # Labels are in the first column + train_y = train_data.iloc[:,0] + train_x = train_data.iloc[:,1:] + + + ## --- Your code here --- ## + + + ## TODO: Define a model + model = LogisticRegression() + + + ## TODO: Train the model + model.fit(train_x, train_y) + + + ## --- End of your code --- ## + + + # Save the trained model + joblib.dump(model, os.path.join(args.model_dir, "model.joblib")) \ No newline at end of file
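[Editorial usage note, an assumption rather than something the diff itself exercises] Once source_sklearn/train.py has written model.joblib, the same model could be loaded back outside SageMaker and scored against feature rows shaped like plagiarism_data/test.csv (label in column 0, similarity features in the remaining columns). The local model/ directory and the standalone joblib import are assumptions made for the sketch; the training script itself imports joblib via sklearn.externals.

import os
import joblib            # assumes the standalone joblib package is installed
import pandas as pd

model_dir = 'model/'                     # hypothetical local directory holding model.joblib
test_csv = 'plagiarism_data/test.csv'    # label in column 0, features in the rest

# Load the serialized classifier saved by the training script.
model = joblib.load(os.path.join(model_dir, 'model.joblib'))

# Split the held-out rows into labels and features, mirroring the training script.
test_data = pd.read_csv(test_csv, header=None)
test_y = test_data.iloc[:, 0]
test_x = test_data.iloc[:, 1:]

# Predict plagiarized (1) vs. not (0) and report simple accuracy.
predictions = model.predict(test_x)
accuracy = (predictions == test_y).mean()
print('Test accuracy: {:.3f}'.format(accuracy))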