diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..58461f2 --- /dev/null +++ b/.gitignore @@ -0,0 +1 @@ +.ipynb_checkpoints \ No newline at end of file diff --git a/.pytest_cache/v/cache/nodeids b/.pytest_cache/v/cache/nodeids new file mode 100644 index 0000000..0637a08 --- /dev/null +++ b/.pytest_cache/v/cache/nodeids @@ -0,0 +1 @@ +[] \ No newline at end of file diff --git a/src/.pytest_cache/v/cache/nodeids b/src/.pytest_cache/v/cache/nodeids new file mode 100644 index 0000000..0637a08 --- /dev/null +++ b/src/.pytest_cache/v/cache/nodeids @@ -0,0 +1 @@ +[] \ No newline at end of file diff --git a/src/__init__.py b/src/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/src/__pycache__/__init__.cpython-36.pyc b/src/__pycache__/__init__.cpython-36.pyc new file mode 100644 index 0000000..d051bdd Binary files /dev/null and b/src/__pycache__/__init__.cpython-36.pyc differ diff --git a/src/__pycache__/assessment.cpython-36.pyc b/src/__pycache__/assessment.cpython-36.pyc new file mode 100644 index 0000000..b957338 Binary files /dev/null and b/src/__pycache__/assessment.cpython-36.pyc differ diff --git a/assessment.py b/src/assessment.py similarity index 72% rename from assessment.py rename to src/assessment.py index 281675d..410b283 100644 --- a/assessment.py +++ b/src/assessment.py @@ -14,7 +14,13 @@ def count_characters(string): Characters which with a count of 0 should not be included in the output dictionary. ''' - pass + char_dict = {} + for char in string: + if char not in char_dict: + char_dict[char] = 1 + else: + char_dict[char] += 1 + return char_dict def invert_dictionary(d): @@ -28,7 +34,13 @@ def invert_dictionary(d): the set of d's keys which shared the same value. e.g. {'a': 2, 'b': 4, 'c': 2} => {2: {'a', 'c'}, 4: {'b'}} ''' - pass + new_d = {} + for k,v in d.items(): + if v not in new_d: + new_d[v] = set(k) + else: + new_d[v].update(k) + return new_d def word_count(filename): @@ -44,7 +56,17 @@ def word_count(filename): 2. number of words (broken by whitespace) 3. number of characters ''' - pass + line_count = 0 + word_count = 0 + char_count = 0 + with open(filename, 'r') as f: + for line in f: + words = line.split() + line_count += 1 + word_count += len(words) + char_count += len(line) + + return line_count, word_count, char_count def matrix_multiplication(A, B): @@ -67,7 +89,24 @@ def matrix_multiplication(A, B): Please do not use numpy. Write your solution in straight python. ''' - pass + + new_matrix = [] + + # for Arow in range(len(A)): + # #new_matrix row.append ETC [row] + # for Bcol in range(len(B[0])): + # for Brow in range(len(B)): + # new_matrix[Arow][Bcol] += A[Arow][Brow] * B[Brow][Bcol] + + new_matrix = [[sum(a*b for a,b in zip(A_row, B_col)) + for B_col in zip(*B)] for A_row in A] + + return new_matrix + +# a = [[5, 3, 8], [9, 0, 1],[6,9,3]] +# b = [[1, 2, 3], [4, 5, 6],[6,1,2]] +# print(matrix_multiplication(a,b)) + # NumPy SECTION @@ -89,7 +128,8 @@ def array_work(rows, cols, scalar, matrixA): [5, 6], * [5, 5, 5]] [7, 8]] ''' - pass + matrixNew = np.full((rows, cols), scalar) + return np.dot(matrixA, matrixNew) def boolean_indexing(arr, minimum): @@ -105,7 +145,7 @@ def boolean_indexing(arr, minimum): In [1]: boolean_indexing([[3, 4, 5], [6, 7, 8]], 7) Out[1]: array([7, 8]) ''' - pass + return arr[arr >= minimum] # Pandas SECTION @@ -128,7 +168,9 @@ def make_series(start, length, index): c 7 dtype: int64 ''' - pass + return pd.Series(range(start,start+length), index=index) + +# print (make_series(5, 3, ['a', 'b', 'c'])) def data_frame_work(df, colA, colB, colC): @@ -139,4 +181,4 @@ def data_frame_work(df, colA, colB, colC): Insert a column (colC) into the dataframe that is the sum of colA and colB. Assume that df contains columns colA and colB and that these are numeric. ''' - pass + df[colC] = df[colA] + df[colB] diff --git a/test/.pytest_cache/v/cache/lastfailed b/test/.pytest_cache/v/cache/lastfailed new file mode 100644 index 0000000..c878883 --- /dev/null +++ b/test/.pytest_cache/v/cache/lastfailed @@ -0,0 +1,3 @@ +{ + "testing.py::test_word_count": true +} \ No newline at end of file diff --git a/test/.pytest_cache/v/cache/nodeids b/test/.pytest_cache/v/cache/nodeids new file mode 100644 index 0000000..86b78dc --- /dev/null +++ b/test/.pytest_cache/v/cache/nodeids @@ -0,0 +1,3 @@ +[ + "testing.py::test_matrix_multiplication" +] \ No newline at end of file diff --git a/test/__init__.py b/test/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/test/__pycache__/__init__.cpython-36.pyc b/test/__pycache__/__init__.cpython-36.pyc new file mode 100644 index 0000000..0cae296 Binary files /dev/null and b/test/__pycache__/__init__.cpython-36.pyc differ diff --git a/test/__pycache__/testing.cpython-36-PYTEST.pyc b/test/__pycache__/testing.cpython-36-PYTEST.pyc new file mode 100644 index 0000000..ac7882f Binary files /dev/null and b/test/__pycache__/testing.cpython-36-PYTEST.pyc differ diff --git a/test/alice.txt b/test/alice.txt new file mode 100644 index 0000000..84bf3cc --- /dev/null +++ b/test/alice.txt @@ -0,0 +1,17 @@ +Alice was beginning to get very tired of sitting by her sister on the bank, and of having nothing to do: once or twice she had peeped into the book her sister was reading, but it had no pictures or conversations in it, 'and what is the use of a book,' thought Alice 'without pictures or conversations?' +So she was considering in her own mind (as well as she could, for the hot day made her feel very sleepy and stupid), whether the pleasure of making a daisy-chain would be worth the trouble of getting up and picking the daisies, when suddenly a White Rabbit with pink eyes ran close by her. +There was nothing so VERY remarkable in that; nor did Alice think it so VERY much out of the way to hear the Rabbit say to itself, 'Oh dear! Oh dear! I shall be late!' (when she thought it over afterwards, it occurred to her that she ought to have wondered at this, but at the time it all seemed quite natural); but when the Rabbit actually TOOK A WATCH OUT OF ITS WAISTCOAT-POCKET, and looked at it, and then hurried on, Alice started to her feet, for it flashed across her mind that she had never before seen a rabbit with either a waistcoat-pocket, or a watch to take out of it, and burning with curiosity, she ran across the field after it, and fortunately was just in time to see it pop down a large rabbit-hole under the hedge. +In another moment down went Alice after it, never once considering how in the world she was to get out again. +The rabbit-hole went straight on like a tunnel for some way, and then dipped suddenly down, so suddenly that Alice had not a moment to think about stopping herself before she found herself falling down a very deep well. +Either the well was very deep, or she fell very slowly, for she had plenty of time as she went down to look about her and to wonder what was going to happen next. First, she tried to look down and make out what she was coming to, but it was too dark to see anything; then she looked at the sides of the well, and noticed that they were filled with cupboards and book-shelves; here and there she saw maps and pictures hung upon pegs. She took down a jar from one of the shelves as she passed; it was labelled 'ORANGE MARMALADE', but to her great disappointment it was empty: she did not like to drop the jar for fear of killing somebody, so managed to put it into one of the cupboards as she fell past it. +'Well!' thought Alice to herself, 'after such a fall as this, I shall think nothing of tumbling down stairs! How brave they'll all think me at home! Why, I wouldn't say anything about it, even if I fell off the top of the house!' (Which was very likely true.) +Down, down, down. Would the fall NEVER come to an end! 'I wonder how many miles I've fallen by this time?' she said aloud. 'I must be getting somewhere near the centre of the earth. Let me see: that would be four thousand miles down, I think—' (for, you see, Alice had learnt several things of this sort in her lessons in the schoolroom, and though this was not a VERY good opportunity for showing off her knowledge, as there was no one to listen to her, still it was good practice to say it over) '—yes, that's about the right distance—but then I wonder what Latitude or Longitude I've got to?' (Alice had no idea what Latitude was, or Longitude either, but thought they were nice grand words to say.) +Presently she began again. 'I wonder if I shall fall right THROUGH the earth! How funny it'll seem to come out among the people that walk with their heads downward! The Antipathies, I think—' (she was rather glad there WAS no one listening, this time, as it didn't sound at all the right word) '—but I shall have to ask them what the name of the country is, you know. Please, Ma'am, is this New Zealand or Australia?' (and she tried to curtsey as she spoke—fancy CURTSEYING as you're falling through the air! Do you think you could manage it?) 'And what an ignorant little girl she'll think me for asking! No, it'll never do to ask: perhaps I shall see it written up somewhere.' +Down, down, down. There was nothing else to do, so Alice soon began talking again. 'Dinah'll miss me very much to-night, I should think!' (Dinah was the cat.) 'I hope they'll remember her saucer of milk at tea-time. Dinah my dear! I wish you were down here with me! There are no mice in the air, I'm afraid, but you might catch a bat, and that's very like a mouse, you know. But do cats eat bats, I wonder?' And here Alice began to get rather sleepy, and went on saying to herself, in a dreamy sort of way, 'Do cats eat bats? Do cats eat bats?' and sometimes, 'Do bats eat cats?' for, you see, as she couldn't answer either question, it didn't much matter which way she put it. She felt that she was dozing off, and had just begun to dream that she was walking hand in hand with Dinah, and saying to her very earnestly, 'Now, Dinah, tell me the truth: did you ever eat a bat?' when suddenly, thump! thump! down she came upon a heap of sticks and dry leaves, and the fall was over. +Alice was not a bit hurt, and she jumped up on to her feet in a moment: she looked up, but it was all dark overhead; before her was another long passage, and the White Rabbit was still in sight, hurrying down it. There was not a moment to be lost: away went Alice like the wind, and was just in time to hear it say, as it turned a corner, 'Oh my ears and whiskers, how late it's getting!' She was close behind it when she turned the corner, but the Rabbit was no longer to be seen: she found herself in a long, low hall, which was lit up by a row of lamps hanging from the roof. +There were doors all round the hall, but they were all locked; and when Alice had been all the way down one side and up the other, trying every door, she walked sadly down the middle, wondering how she was ever to get out again. +Suddenly she came upon a little three-legged table, all made of solid glass; there was nothing on it except a tiny golden key, and Alice's first thought was that it might belong to one of the doors of the hall; but, alas! either the locks were too large, or the key was too small, but at any rate it would not open any of them. However, on the second time round, she came upon a low curtain she had not noticed before, and behind it was a little door about fifteen inches high: she tried the little golden key in the lock, and to her great delight it fitted! +Alice opened the door and found that it led into a small passage, not much larger than a rat-hole: she knelt down and looked along the passage into the loveliest garden you ever saw. How she longed to get out of that dark hall, and wander about among those beds of bright flowers and those cool fountains, but she could not even get her head through the doorway; 'and even if my head would go through,' thought poor Alice, 'it would be of very little use without my shoulders. Oh, how I wish I could shut up like a telescope! I think I could, if I only knew how to begin.' For, you see, so many out-of-the-way things had happened lately, that Alice had begun to think that very few things indeed were really impossible. +There seemed to be no use in waiting by the little door, so she went back to the table, half hoping she might find another key on it, or at any rate a book of rules for shutting people up like telescopes: this time she found a little bottle on it, ('which certainly was not here before,' said Alice,) and round the neck of the bottle was a paper label, with the words 'DRINK ME' beautifully printed on it in large letters. +It was all very well to say 'Drink me,' but the wise little Alice was not going to do THAT in a hurry. 'No, I'll look first,' she said, 'and see whether it's marked "poison" or not'; for she had read several nice little histories about children who had got burnt, and eaten up by wild beasts and other unpleasant things, all because they WOULD not remember the simple rules their friends had taught them: such as, that a red-hot poker will burn you if you hold it too long; and that if you cut your finger VERY deeply with a knife, it usually bleeds; and she had never forgotten that, if you drink much from a bottle marked 'poison,' it is almost certain to disagree with you, sooner or later. +However, this bottle was NOT marked 'poison,' so Alice ventured to taste it, and finding it very nice, (it had, in fact, a sort of mixed flavour of cherry-tart, custard, pine-apple, roast turkey, toffee, and hot buttered toast,) she very soon finished it off. diff --git a/test/testing.py b/test/testing.py new file mode 100644 index 0000000..5d0af2b --- /dev/null +++ b/test/testing.py @@ -0,0 +1,87 @@ +import sys +sys.path.append('../') +from src import assessment #as a +# from src.vector import Vector +import numpy as np +import pandas as pd +from pytest import approx + + +# def test_count_characters(): +# string = "abafdcggfaabe" +# answer = {"a": 4, "b": 2, "c": 1, "d": 1, "e": 1, "f": 2, "g": 2} +# result = assessment.count_characters(string) +# assert result == answer + + +# def test_invert_dictionary(): +# d = {"a": 4, "b": 2, "c": 1, "d": 1, "e": 1, "f": 2, "g": 2} +# result = {4: {'a'}, 2: {'b', 'f', 'g'}, 1: {'c', 'd', 'e'}} +# assert assessment.invert_dictionary(d) == result + + +# def test_word_count(): +# assert assessment.word_count('alice.txt') == (17, 1615, 8449) +### not sure why failing this one; just off on char_count by 12 chars + +def test_matrix_multiplication(): + A = [[2, 3, 4], [6, 4, 2], [-1, 2, 0]] + B = [[8, -3, 1], [-7, 3, 2], [0, 3, 3]] + answer = [[-5, 15, 20], [20, 0, 20], [-22, 9, 3]] + assert assessment.matrix_multiplication(A, B) == answer +### can't get this to work + + +# def test_array_work(): +# matrixA = np.array([[-4, -2], +# [0, -3], +# [-4, -1], +# [-1, 1], +# [-3, 0]]) +# answer1 = np.array([[-24, -24, -24], +# [-12, -12, -12], +# [-20, -20, -20], +# [0, 0, 0], +# [-12, -12, -12]]) +# result1 = assessment.array_work(2, 3, 4, matrixA) +# assert np.all(answer1 == result1) + +# answer2 = np.array([[-36, -36], +# [-18, -18], +# [-30, -30], +# [0, 0], +# [-18, -18]]) + # result2 = assessment.array_work(2, 2, 6, matrixA) + # assert np.all(answer2 == result2) + + +# def test_make_series(): +# result = assessment.make_series(7, 4, ['a', 'b', 'c', 'd']) +# assert isinstance(result, pd.Series) +# assert result['a'] == 7 +# assert result['d'] == 10 + +# result = assessment.make_series(22, 5, ['a', 'b', 'c', 'd', 'hi']) +# assert result['a'] == 22 +# assert result['d'] == 25 +# assert result['hi'] == 26 + + +# def test_data_frame_work(): +# df = pd.DataFrame({'a': [1, 2, 3], 'b': [4, 5, 6]}) +# colA, colB, colC = ('a', 'b', 'c') +# assessment.data_frame_work(df, colA, colB, colC) +# assert colC in df.columns.tolist() +# assert df[colC].tolist(), [5, 7, 9] + + +# def test_boolean_indexing(): +# arr = np.array([[-4, -4, -3], +# [-1, 16, -4], +# [-3, 6, 4]]) +# result1 = assessment.boolean_indexing(arr, 0) +# answer1 = np.array([16, 6, 4]) +# assert np.all(result1 == answer1) +# result2 = assessment.boolean_indexing(arr, 10) +# answer2 = np.array([16]) +# assert np.all(result2 == answer2) diff --git a/testing.py b/testing.py deleted file mode 100644 index c68b010..0000000 --- a/testing.py +++ /dev/null @@ -1,77 +0,0 @@ -def test_count_characters(self): - string = "abafdcggfaabe" - answer = {"a": 4, "b": 2, "c": 1, "d": 1, "e": 1, "f": 2, "g": 2} - result = a.count_characters(string) - self.assertEqual(result, answer) - - -def test_invert_dictionary(self): - d = {"a": 4, "b": 2, "c": 1, "d": 1, "e": 1, "f": 2, "g": 2} - result = {4: {'a'}, 2: {'b', 'f', 'g'}, 1: {'c', 'd', 'e'}} - self.assertEqual(a.invert_dictionary(d), result) - - -def test_word_count(self): - self.assertEqual(a.word_count('data/alice.txt'), (17, 1615, 8449)) - - -def test_matrix_multiplication(self): - A = [[2, 3, 4], [6, 4, 2], [-1, 2, 0]] - B = [[8, -3, 1], [-7, 3, 2], [0, 3, 3]] - answer = [[-5, 15, 20], [20, 0, 20], [-22, 9, 3]] - self.assertEqual(a.matrix_multiplication(A, B), answer) - - -def test_array_work(self): - matrixA = np.array([[-4, -2], - [0, -3], - [-4, -1], - [-1, 1], - [-3, 0]]) - answer1 = np.array([[-24, -24, -24], - [-12, -12, -12], - [-20, -20, -20], - [0, 0, 0], - [-12, -12, -12]]) - result1 = a.array_work(2, 3, 4, matrixA) - self.assertTrue(np.all(answer1 == result1)) - - answer2 = np.array([[-36, -36], - [-18, -18], - [-30, -30], - [0, 0], - [-18, -18]]) - result2 = a.array_work(2, 2, 6, matrixA) - self.assertTrue(np.all(answer2 == result2)) - - -def test_make_series(self): - result = a.make_series(7, 4, ['a', 'b', 'c', 'd']) - self.assertTrue(isinstance(result, pd.Series)) - self.assertEqual(result['a'], 7) - self.assertEqual(result['d'], 10) - - result = a.make_series(22, 5, ['a', 'b', 'c', 'd', 'hi']) - self.assertEqual(result['a'], 22) - self.assertEqual(result['d'], 25) - self.assertEqual(result['hi'], 26) - - -def test_data_frame_work(self): - df = pd.DataFrame({'a': [1, 2, 3], 'b': [4, 5, 6]}) - colA, colB, colC = ('a', 'b', 'c') - a.data_frame_work(df, colA, colB, colC) - self.assertTrue(colC in df.columns.tolist()) - self.assertEqual(df[colC].tolist(), [5, 7, 9]) - - -def test_boolean_indexing(self): - arr = np.array([[-4, -4, -3], - [-1, 16, -4], - [-3, 6, 4]]) - result1 = a.boolean_indexing(arr, 0) - answer1 = np.array([16, 6, 4]) - self.assertTrue(np.all(result1 == answer1)) - result2 = a.boolean_indexing(arr, 10) - answer2 = np.array([16]) - self.assertTrue(np.all(result2 == answer2)) diff --git a/titanic.ipynb b/titanic.ipynb new file mode 100644 index 0000000..481a567 --- /dev/null +++ b/titanic.ipynb @@ -0,0 +1,2868 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 1226, + "metadata": {}, + "outputs": [], + "source": [ + "import numpy as np\n", + "import pandas as pd\n", + "import matplotlib.pyplot as plt\n", + "import statsmodels.api as sm\n", + "from sklearn.model_selection import train_test_split\n", + "from sklearn.metrics import r2_score\n", + "from pandas.plotting import scatter_matrix\n", + "from sklearn.linear_model import LogisticRegression\n", + "%matplotlib inline" + ] + }, + { + "cell_type": "code", + "execution_count": 1227, + "metadata": {}, + "outputs": [], + "source": [ + "df = pd.read_csv('train.csv')" + ] + }, + { + "cell_type": "code", + "execution_count": 1228, + "metadata": { + "scrolled": true + }, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
countmeanstdmin25%50%75%max
PassengerId891.0446.000000257.3538421.00223.5000446.0000668.5891.0000
Survived891.00.3838380.4865920.000.00000.00001.01.0000
Pclass891.02.3086420.8360711.002.00003.00003.03.0000
Age714.029.69911814.5264970.4220.125028.000038.080.0000
SibSp891.00.5230081.1027430.000.00000.00001.08.0000
Parch891.00.3815940.8060570.000.00000.00000.06.0000
Fare891.032.20420849.6934290.007.910414.454231.0512.3292
\n", + "
" + ], + "text/plain": [ + " count mean std min 25% 50% 75% \\\n", + "PassengerId 891.0 446.000000 257.353842 1.00 223.5000 446.0000 668.5 \n", + "Survived 891.0 0.383838 0.486592 0.00 0.0000 0.0000 1.0 \n", + "Pclass 891.0 2.308642 0.836071 1.00 2.0000 3.0000 3.0 \n", + "Age 714.0 29.699118 14.526497 0.42 20.1250 28.0000 38.0 \n", + "SibSp 891.0 0.523008 1.102743 0.00 0.0000 0.0000 1.0 \n", + "Parch 891.0 0.381594 0.806057 0.00 0.0000 0.0000 0.0 \n", + "Fare 891.0 32.204208 49.693429 0.00 7.9104 14.4542 31.0 \n", + "\n", + " max \n", + "PassengerId 891.0000 \n", + "Survived 1.0000 \n", + "Pclass 3.0000 \n", + "Age 80.0000 \n", + "SibSp 8.0000 \n", + "Parch 6.0000 \n", + "Fare 512.3292 " + ] + }, + "execution_count": 1228, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df.describe().T" + ] + }, + { + "cell_type": "code", + "execution_count": 1229, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
SurvivedPclassSexAgeSibSpParchFareEmbarked
003male22.0107.2500S
111female38.01071.2833C
213female26.0007.9250S
311female35.01053.1000S
403male35.0008.0500S
\n", + "
" + ], + "text/plain": [ + " Survived Pclass Sex Age SibSp Parch Fare Embarked\n", + "0 0 3 male 22.0 1 0 7.2500 S\n", + "1 1 1 female 38.0 1 0 71.2833 C\n", + "2 1 3 female 26.0 0 0 7.9250 S\n", + "3 1 1 female 35.0 1 0 53.1000 S\n", + "4 0 3 male 35.0 0 0 8.0500 S" + ] + }, + "execution_count": 1229, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df = df.drop(columns=['PassengerId','Name','Cabin','Ticket'])\n", + "df.head()" + ] + }, + { + "cell_type": "code", + "execution_count": 1230, + "metadata": { + "scrolled": true + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + "RangeIndex: 891 entries, 0 to 890\n", + "Data columns (total 8 columns):\n", + "Survived 891 non-null int64\n", + "Pclass 891 non-null int64\n", + "Sex 891 non-null object\n", + "Age 714 non-null float64\n", + "SibSp 891 non-null int64\n", + "Parch 891 non-null int64\n", + "Fare 891 non-null float64\n", + "Embarked 889 non-null object\n", + "dtypes: float64(2), int64(4), object(2)\n", + "memory usage: 55.8+ KB\n" + ] + } + ], + "source": [ + "df.info()" + ] + }, + { + "cell_type": "code", + "execution_count": 1231, + "metadata": {}, + "outputs": [], + "source": [ + "df[df.Age.isna()]\n", + "age_median = df.Age.median()\n", + "df['Age'] = df.Age.fillna(age_median)\n", + "# also try using mean?" + ] + }, + { + "cell_type": "code", + "execution_count": 1232, + "metadata": {}, + "outputs": [], + "source": [ + "df[df.Embarked.isna()]\n", + "embarked_mode = df.Embarked.mode()\n", + "df['Embarked'] = df.Embarked.fillna(embarked_mode)\n", + "# also try dropping if embarked is important???" + ] + }, + { + "cell_type": "code", + "execution_count": 1233, + "metadata": {}, + "outputs": [], + "source": [ + "# df['Pclass'] = pd.to_string(df.Pclass, errors='coerce')\n", + "df['Pclass'] = df['Pclass'].astype(str)" + ] + }, + { + "cell_type": "code", + "execution_count": 1234, + "metadata": {}, + "outputs": [], + "source": [ + "# baseline is guess of non-survival\n", + "# or probability of survival based on whole pop, .38 = (342 / (342+549))" + ] + }, + { + "cell_type": "code", + "execution_count": 1235, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
PclassSexAgeSibSpParchFareEmbarked
03male22.0107.2500S
11female38.01071.2833C
23female26.0007.9250S
31female35.01053.1000S
43male35.0008.0500S
\n", + "
" + ], + "text/plain": [ + " Pclass Sex Age SibSp Parch Fare Embarked\n", + "0 3 male 22.0 1 0 7.2500 S\n", + "1 1 female 38.0 1 0 71.2833 C\n", + "2 3 female 26.0 0 0 7.9250 S\n", + "3 1 female 35.0 1 0 53.1000 S\n", + "4 3 male 35.0 0 0 8.0500 S" + ] + }, + "execution_count": 1235, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# df.groupby([int(a)//2 for a in 'Age'])['Survived'].count()\n", + "y = df.Survived\n", + "X = df.drop(columns=['Survived'])\n", + "X.head()" + ] + }, + { + "cell_type": "code", + "execution_count": 1236, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
constAgeSibSpParchFarePclass_2Pclass_3Sex_maleEmbarked_QEmbarked_S
01.022.0107.250001101
11.038.01071.283300000
21.026.0007.925001001
31.035.01053.100000001
41.035.0008.050001101
\n", + "
" + ], + "text/plain": [ + " const Age SibSp Parch Fare Pclass_2 Pclass_3 Sex_male \\\n", + "0 1.0 22.0 1 0 7.2500 0 1 1 \n", + "1 1.0 38.0 1 0 71.2833 0 0 0 \n", + "2 1.0 26.0 0 0 7.9250 0 1 0 \n", + "3 1.0 35.0 1 0 53.1000 0 0 0 \n", + "4 1.0 35.0 0 0 8.0500 0 1 1 \n", + "\n", + " Embarked_Q Embarked_S \n", + "0 0 1 \n", + "1 0 0 \n", + "2 0 1 \n", + "3 0 1 \n", + "4 0 1 " + ] + }, + "execution_count": 1236, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "X =pd.get_dummies(X)\n", + "# prob drop PassengerID and will need to drop one and dummy Sex, Embarked, Pclass\n", + "X = X.drop(columns=['Sex_female','Embarked_C','Pclass_1'])\n", + "X = sm.add_constant(X)\n", + "X.head()" + ] + }, + { + "cell_type": "code", + "execution_count": 1237, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "0 549\n", + "1 342\n", + "Name: Survived, dtype: int64" + ] + }, + "execution_count": 1237, + "metadata": {}, + "output_type": "execute_result" + }, + { + "data": { + "image/png": "iVBORw0KGgoAAAANSUhEUgAAAXoAAAD8CAYAAAB5Pm/hAAAABHNCSVQICAgIfAhkiAAAAAlwSFlzAAALEgAACxIB0t1+/AAAADl0RVh0U29mdHdhcmUAbWF0cGxvdGxpYiB2ZXJzaW9uIDIuMi4yLCBodHRwOi8vbWF0cGxvdGxpYi5vcmcvhp/UCwAAENRJREFUeJzt3X+s3XV9x/Hn21bUta5Fqjek7VYWayKRqHhDuphst9a4ggvlD1gwOApp1sSxxU2y2W3J3K8/YAtjgRj1bjiKQQtzc22AzZDCCXNZme1QCjLDFTu4ltBpy92uqFvne3+cT8213HK/9/y4397PfT6Sm/P9fr6fcz6f9+nt637v55zzvZGZSJLq9aq2JyBJGi6DXpIqZ9BLUuUMekmqnEEvSZUz6CWpcga9JFXOoJekyhn0klS55W1PAGDNmjW5YcOGnu773e9+lxUrVgx2Qmc5a14arHlp6KfmQ4cOfTsz3zhXv7Mi6Dds2MDBgwd7um+n02FsbGywEzrLWfPSYM1LQz81R8R/NOnn0o0kVc6gl6TKGfSSVDmDXpIqZ9BLUuUMekmqnEEvSZUz6CWpcga9JFXurPhkbD8Of2uK63bd38rYR256fyvjStJ8eEYvSZUz6CWpcga9JFXOoJekyhn0klQ5g16SKmfQS1LlDHpJqpxBL0mVM+glqXIGvSRVzqCXpMoZ9JJUOYNekirXKOgj4khEHI6Ir0TEwdL2hoh4MCKeLrfnlvaIiNsiYiIiHo+Ii4dZgCTplc3njH5zZr4jM0fL/i5gf2ZuBPaXfYBLgY3layfwiUFNVpI0f/0s3WwDdpft3cAVM9rvyq4DwOqIOL+PcSRJfYjMnLtTxDeBE0ACn8rM8Yh4MTNXz+hzIjPPjYj7gJsy80ulfT/w0cw8eNpj7qR7xs/IyMi79uzZ01MBx45P8cL3erpr3y5au6qVcaenp1m5cmUrY7fFmpcGa56fzZs3H5qxynJGTf+U4Lsz82hEvAl4MCL+/RX6xixtL/tpkpnjwDjA6Ohojo2NNZzKj7v97r3ccridv4h45JqxVsbtdDr0+nwtVta8NFjzcDRausnMo+X2GPAF4BLghVNLMuX2WOk+Cayfcfd1wNFBTViSND9zBn1ErIiI15/aBt4HPAHsA7aXbtuBvWV7H3BteffNJmAqM58f+MwlSY00WfMYAb4QEaf6fzYz/zEivgzcGxE7gGeBq0r/B4DLgAngJeD6gc9aktTYnEGfmc8Ab5+l/TvAllnaE7hhILOTJPXNT8ZKUuUMekmqnEEvSZUz6CWpcga9JFXOoJekyhn0klQ5g16SKmfQS1LlDHpJqpxBL0mVM+glqXIGvSRVzqCXpMoZ9JJUOYNekipn0EtS5Qx6SaqcQS9JlTPoJalyBr0kVc6gl6TKGfSSVDmDXpIqZ9BLUuUMekmqnEEvSZUz6CWpco2DPiKWRcRjEXFf2b8gIh6NiKcj4p6IOKe0v6bsT5TjG4YzdUlSE/M5o/8w8NSM/ZuBWzNzI3AC2FHadwAnMvPNwK2lnySpJY2CPiLWAe8H/qrsB/Ae4POly27girK9rexTjm8p/SVJLWh6Rv8XwG8DPyz75wEvZubJsj8JrC3ba4HnAMrxqdJfktSC5XN1iIhfBI5l5qGIGDvVPEvXbHBs5uPuBHYCjIyM0Ol0msz3ZUZeBzdedHLujkPQ65z7NT093drYbbHmpcGah2POoAfeDVweEZcBrwV+ku4Z/uqIWF7O2tcBR0v/SWA9MBkRy4FVwPHTHzQzx4FxgNHR0RwbG+upgNvv3ssth5uUMXhHrhlrZdxOp0Ovz9diZc1LgzUPx5xLN5n5O5m5LjM3AFcDD2XmNcDDwJWl23Zgb9neV/Ypxx/KzJed0UuSFkY/76P/KPCRiJiguwZ/R2m/AzivtH8E2NXfFCVJ/ZjXmkdmdoBO2X4GuGSWPt8HrhrA3CRJA+AnYyWpcga9JFXOoJekyhn0klQ5g16SKmfQS1LlDHpJqpxBL0mVM+glqXIGvSRVzqCXpMoZ9JJUOYNekipn0EtS5Qx6SaqcQS9JlTPoJaly7fxVbUk6i2zYdX9rY9+5dcXQx/CMXpIqZ9BLUuUMekmqnEEvSZUz6CWpcga9JFXOoJekyhn0klQ5g16SKmfQS1LlDHpJqtycQR8Rr42If42Ir0bEkxHxh6X9goh4NCKejoh7IuKc0v6asj9Rjm8YbgmSpFfS5Iz+B8B7MvPtwDuArRGxCbgZuDUzNwIngB2l/w7gRGa+Gbi19JMktWTOoM+u6bL76vKVwHuAz5f23cAVZXtb2acc3xIRMbAZS5LmpdEafUQsi4ivAMeAB4FvAC9m5snSZRJYW7bXAs8BlONTwHmDnLQkqbnIzOadI1YDXwB+H/jrsjxDRKwHHsjMiyLiSeAXMnOyHPsGcElmfue0x9oJ7AQYGRl51549e3oq4NjxKV74Xk937dtFa1e1Mu709DQrV65sZey2WPPS0FbNh781teBjnnLBqmU917x58+ZDmTk6V795/eGRzHwxIjrAJmB1RCwvZ+3rgKOl2ySwHpiMiOXAKuD4LI81DowDjI6O5tjY2Hym8iO3372XWw638/dTjlwz1sq4nU6HXp+vxcqal4a2ar6u5T88Muyam7zr5o3lTJ6IeB3wXuAp4GHgytJtO7C3bO8r+5TjD+V8fm2QJA1Uk1Ph84HdEbGM7g+GezPzvoj4GrAnIv4EeAy4o/S/A/hMREzQPZO/egjzliQ1NGfQZ+bjwDtnaX8GuGSW9u8DVw1kdpKkvvnJWEmqnEEvSZUz6CWpcga9JFXOoJekyhn0klQ5g16SKmfQS1LlDHpJqpxBL0mVM+glqXIGvSRVzqCXpMoZ9JJUOYNekipn0EtS5Qx6SaqcQS9JlTPoJalyBr0kVc6gl6TKGfSSVDmDXpIqZ9BLUuUMekmqnEEvSZUz6CWpcga9JFXOoJekys0Z9BGxPiIejoinIuLJiPhwaX9DRDwYEU+X23NLe0TEbRExERGPR8TFwy5CknRmTc7oTwI3ZuZbgU3ADRFxIbAL2J+ZG4H9ZR/gUmBj+doJfGLgs5YkNTZn0Gfm85n5b2X7v4GngLXANmB36bYbuKJsbwPuyq4DwOqIOH/gM5ckNRKZ2bxzxAbgEeBtwLOZuXrGsROZeW5E3AfclJlfKu37gY9m5sHTHmsn3TN+RkZG3rVnz56eCjh2fIoXvtfTXft20dpVrYw7PT3NypUrWxm7Lda8NLRV8+FvTS34mKdcsGpZzzVv3rz5UGaOztVvedMHjIiVwN8Cv5GZ/xURZ+w6S9vLfppk5jgwDjA6OppjY2NNp/Jjbr97L7ccblzGQB25ZqyVcTudDr0+X4uVNS8NbdV83a77F3zMU+7cumLoNTd6101EvJpuyN+dmX9Xml84tSRTbo+V9klg/Yy7rwOODma6kqT5avKumwDuAJ7KzD+fcWgfsL1sbwf2zmi/trz7ZhMwlZnPD3DOkqR5aLLm8W7gl4HDEfGV0va7wE3AvRGxA3gWuKocewC4DJgAXgKuH+iMJUnzMmfQlxdVz7Qgv2WW/gnc0Oe8JEkD4idjJalyBr0kVc6gl6TKGfSSVDmDXpIqZ9BLUuUMekmqnEEvSZUz6CWpcga9JFXOoJekyhn0klQ5g16SKmfQS1LlDHpJqpxBL0mVM+glqXIGvSRVzqCXpMoZ9JJUOYNekipn0EtS5Qx6SaqcQS9JlTPoJalyBr0kVc6gl6TKGfSSVLk5gz4iPh0RxyLiiRltb4iIByPi6XJ7bmmPiLgtIiYi4vGIuHiYk5ckza3JGf2dwNbT2nYB+zNzI7C/7ANcCmwsXzuBTwxmmpKkXs0Z9Jn5CHD8tOZtwO6yvRu4Ykb7Xdl1AFgdEecParKSpPnrdY1+JDOfByi3byrta4HnZvSbLG2SpJYsH/DjxSxtOWvHiJ10l3cYGRmh0+n0NODI6+DGi072dN9+9Trnfk1PT7c2dluseWloq+a2MgQWpuZeg/6FiDg/M58vSzPHSvsksH5Gv3XA0dkeIDPHgXGA0dHRHBsb62kit9+9l1sOD/rnVTNHrhlrZdxOp0Ovz9diZc1LQ1s1X7fr/gUf85Q7t64Yes29Lt3sA7aX7e3A3hnt15Z332wCpk4t8UiS2jHnqXBEfA4YA9ZExCTwMeAm4N6I2AE8C1xVuj8AXAZMAC8B1w9hzpKkeZgz6DPzA2c4tGWWvgnc0O+kJEmD4ydjJalyBr0kVc6gl6TKGfSSVDmDXpIqZ9BLUuUMekmqnEEvSZUz6CWpcga9JFXOoJekyhn0klQ5g16SKmfQS1LlDHpJqpxBL0mVM+glqXIGvSRVzqCXpMoZ9JJUOYNekipn0EtS5Qx6SaqcQS9JlTPoJalyBr0kVc6gl6TKGfSSVDmDXpIqN5Sgj4itEfH1iJiIiF3DGEOS1MzAgz4ilgEfBy4FLgQ+EBEXDnocSVIzwzijvwSYyMxnMvN/gD3AtiGMI0lqYBhBvxZ4bsb+ZGmTJLVg+RAeM2Zpy5d1itgJ7Cy70xHx9R7HWwN8u8f79iVubmNUoMWaW2TNS8OSq3nzzX3V/NNNOg0j6CeB9TP21wFHT++UmePAeL+DRcTBzBzt93EWE2teGqx5aViImoexdPNlYGNEXBAR5wBXA/uGMI4kqYGBn9Fn5smI+DXgi8Ay4NOZ+eSgx5EkNTOMpRsy8wHggWE89iz6Xv5ZhKx5abDmpWHoNUfmy14nlSRVxEsgSFLlFk3Qz3VZhYh4TUTcU44/GhEbFn6Wg9Wg5o9ExNci4vGI2B8Rjd5qdTZrevmMiLgyIjIiFv07NJrUHBG/VP6tn4yIzy70HAetwff2T0XEwxHxWPn+vqyNeQ5KRHw6Io5FxBNnOB4RcVt5Ph6PiIsHOoHMPOu/6L6o+w3gZ4BzgK8CF57W51eBT5btq4F72p73AtS8GfiJsv2hpVBz6fd64BHgADDa9rwX4N95I/AYcG7Zf1Pb816AmseBD5XtC4Ejbc+7z5p/DrgYeOIMxy8D/oHu55A2AY8OcvzFckbf5LIK24DdZfvzwJaImO3DW4vFnDVn5sOZ+VLZPUD3MwuLWdPLZ/wx8KfA9xdyckPSpOZfAT6emScAMvPYAs9x0JrUnMBPlu1VzPJZnMUkMx8Bjr9Cl23AXdl1AFgdEecPavzFEvRNLqvwoz6ZeRKYAs5bkNkNx3wvJbGD7hnBYjZnzRHxTmB9Zt63kBMboib/zm8B3hIR/xwRByJi64LNbjia1PwHwAcjYpLuO/h+fWGm1pqhXjpmKG+vHIIml1VodOmFRaRxPRHxQWAU+Pmhzmj4XrHmiHgVcCtw3UJNaAE0+XdeTnf5Zozub23/FBFvy8wXhzy3YWlS8weAOzPzloj4WeAzpeYfDn96rRhqfi2WM/oml1X4UZ+IWE73171X+lXpbNfoUhIR8V7g94DLM/MHCzS3YZmr5tcDbwM6EXGE7lrmvkX+gmzT7+29mfm/mflN4Ot0g3+xalLzDuBegMz8F+C1dK+DU6tG/997tViCvsllFfYB28v2lcBDWV7lWKTmrLksY3yKbsgv9nVbmKPmzJzKzDWZuSEzN9B9XeLyzDzYznQHosn39t/TfeGdiFhDdynnmQWd5WA1qflZYAtARLyVbtD/54LOcmHtA64t777ZBExl5vODevBFsXSTZ7isQkT8EXAwM/cBd9D99W6C7pn81e3NuH8Na/4zYCXwN+V152cz8/LWJt2nhjVXpWHNXwTeFxFfA/4P+K3M/E57s+5Pw5pvBP4yIn6T7hLGdYv5xC0iPkd36W1Ned3hY8CrATLzk3Rfh7gMmABeAq4f6PiL+LmTJDWwWJZuJEk9MuglqXIGvSRVzqCXpMoZ9JJUOYNekipn0EtS5Qx6Sarc/wOuFeo7bmQhLgAAAABJRU5ErkJggg==\n", + "text/plain": [ + "
" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "#df['Survived'].hist()\n", + "y.hist()\n", + "y.value_counts()" + ] + }, + { + "cell_type": "code", + "execution_count": 1238, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "Age -0.064910\n", + "SibSp -0.035322\n", + "Parch 0.081629\n", + "Fare 0.257307\n", + "Survived 1.000000\n", + "Name: Survived, dtype: float64" + ] + }, + "execution_count": 1238, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df.corr()['Survived'].sort_values()" + ] + }, + { + "cell_type": "code", + "execution_count": 1239, + "metadata": {}, + "outputs": [], + "source": [ + "X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)" + ] + }, + { + "cell_type": "code", + "execution_count": 1240, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "(596, 10)" + ] + }, + "execution_count": 1240, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "X_train.shape" + ] + }, + { + "cell_type": "code", + "execution_count": 1241, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "\n", + "\n", + "\n", + " \n", + "\n", + "\n", + " \n", + "\n", + "\n", + " \n", + "\n", + "\n", + " \n", + "\n", + "\n", + " \n", + "\n", + "\n", + " \n", + "\n", + "\n", + " \n", + "\n", + "\n", + " \n", + "\n", + "\n", + " \n", + "\n", + "
OLS Regression Results
Dep. Variable: Survived R-squared: 0.374
Model: OLS Adj. R-squared: 0.364
Method: Least Squares F-statistic: 38.84
Date: Sun, 22 Jul 2018 Prob (F-statistic): 3.69e-54
Time: 15:14:03 Log-Likelihood: -273.13
No. Observations: 596 AIC: 566.3
Df Residuals: 586 BIC: 610.2
Df Model: 9
Covariance Type: nonrobust
\n", + "\n", + "\n", + " \n", + "\n", + "\n", + " \n", + "\n", + "\n", + " \n", + "\n", + "\n", + " \n", + "\n", + "\n", + " \n", + "\n", + "\n", + " \n", + "\n", + "\n", + " \n", + "\n", + "\n", + " \n", + "\n", + "\n", + " \n", + "\n", + "\n", + " \n", + "\n", + "\n", + " \n", + "\n", + "
coef std err t P>|t| [0.025 0.975]
const 1.0607 0.076 13.892 0.000 0.911 1.211
Age -0.0040 0.001 -2.935 0.003 -0.007 -0.001
SibSp -0.0337 0.015 -2.271 0.023 -0.063 -0.005
Parch -0.0203 0.023 -0.872 0.384 -0.066 0.025
Fare 0.0006 0.000 1.615 0.107 -0.000 0.001
Pclass_2 -0.0339 0.056 -0.600 0.549 -0.145 0.077
Pclass_3 -0.2488 0.053 -4.714 0.000 -0.352 -0.145
Sex_male -0.4913 0.035 -13.881 0.000 -0.561 -0.422
Embarked_Q -0.0509 0.068 -0.743 0.458 -0.185 0.084
Embarked_S -0.1261 0.044 -2.862 0.004 -0.213 -0.040
\n", + "\n", + "\n", + " \n", + "\n", + "\n", + " \n", + "\n", + "\n", + " \n", + "\n", + "\n", + " \n", + "\n", + "
Omnibus: 32.662 Durbin-Watson: 2.036
Prob(Omnibus): 0.000 Jarque-Bera (JB): 36.901
Skew: 0.608 Prob(JB): 9.71e-09
Kurtosis: 3.081 Cond. No. 378.


Warnings:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified." + ], + "text/plain": [ + "\n", + "\"\"\"\n", + " OLS Regression Results \n", + "==============================================================================\n", + "Dep. Variable: Survived R-squared: 0.374\n", + "Model: OLS Adj. R-squared: 0.364\n", + "Method: Least Squares F-statistic: 38.84\n", + "Date: Sun, 22 Jul 2018 Prob (F-statistic): 3.69e-54\n", + "Time: 15:14:03 Log-Likelihood: -273.13\n", + "No. Observations: 596 AIC: 566.3\n", + "Df Residuals: 586 BIC: 610.2\n", + "Df Model: 9 \n", + "Covariance Type: nonrobust \n", + "==============================================================================\n", + " coef std err t P>|t| [0.025 0.975]\n", + "------------------------------------------------------------------------------\n", + "const 1.0607 0.076 13.892 0.000 0.911 1.211\n", + "Age -0.0040 0.001 -2.935 0.003 -0.007 -0.001\n", + "SibSp -0.0337 0.015 -2.271 0.023 -0.063 -0.005\n", + "Parch -0.0203 0.023 -0.872 0.384 -0.066 0.025\n", + "Fare 0.0006 0.000 1.615 0.107 -0.000 0.001\n", + "Pclass_2 -0.0339 0.056 -0.600 0.549 -0.145 0.077\n", + "Pclass_3 -0.2488 0.053 -4.714 0.000 -0.352 -0.145\n", + "Sex_male -0.4913 0.035 -13.881 0.000 -0.561 -0.422\n", + "Embarked_Q -0.0509 0.068 -0.743 0.458 -0.185 0.084\n", + "Embarked_S -0.1261 0.044 -2.862 0.004 -0.213 -0.040\n", + "==============================================================================\n", + "Omnibus: 32.662 Durbin-Watson: 2.036\n", + "Prob(Omnibus): 0.000 Jarque-Bera (JB): 36.901\n", + "Skew: 0.608 Prob(JB): 9.71e-09\n", + "Kurtosis: 3.081 Cond. No. 378.\n", + "==============================================================================\n", + "\n", + "Warnings:\n", + "[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.\n", + "\"\"\"" + ] + }, + "execution_count": 1241, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "model = sm.OLS(y_train, X_train)\n", + "results = model.fit()\n", + "results.summary()" + ] + }, + { + "cell_type": "code", + "execution_count": 1242, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,\n", + " intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,\n", + " penalty='l2', random_state=None, solver='liblinear', tol=0.0001,\n", + " verbose=0, warm_start=False)" + ] + }, + "execution_count": 1242, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "model = LogisticRegression()\n", + "model.fit(X_train, y_train)" + ] + }, + { + "cell_type": "code", + "execution_count": 1243, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "array([0, 0, 0, 1, 1, 1, 1, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0,\n", + " 1, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0,\n", + " 1, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 1, 0, 1, 1, 1, 0, 1, 1, 0, 0, 1,\n", + " 0, 0, 0, 1, 1, 1, 0, 1, 0, 0, 1, 1, 1, 0, 0, 1, 1, 0, 0, 0, 1, 1,\n", + " 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0,\n", + " 1, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 1, 1, 0, 0, 0, 1, 1, 1, 0, 1, 0,\n", + " 0, 1, 0, 1, 1, 0, 0, 1, 0, 1, 0, 0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 1,\n", + " 0, 0, 0, 1, 1, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 1, 1, 0, 1, 0, 0,\n", + " 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 1, 1, 0,\n", + " 1, 0, 0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 1, 0, 1, 0,\n", + " 0, 1, 0, 0, 0, 1, 0, 1, 1, 1, 0, 1, 0, 1, 0, 1, 1, 1, 1, 0, 0, 1,\n", + " 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 1, 1, 0, 1, 0,\n", + " 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0,\n", + " 1, 0, 0, 0, 0, 0, 1, 1, 0], dtype=int64)" + ] + }, + "execution_count": 1243, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "model.predict(X_test)" + ] + }, + { + "cell_type": "code", + "execution_count": 1244, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
constAgeSibSpParchFarePclass_2Pclass_3Sex_maleEmbarked_QEmbarked_S
7091.028.01115.245801100
4391.031.00010.500010101
8401.020.0007.925001101
7201.06.00133.000010001
391.014.01011.241701000
\n", + "
" + ], + "text/plain": [ + " const Age SibSp Parch Fare Pclass_2 Pclass_3 Sex_male \\\n", + "709 1.0 28.0 1 1 15.2458 0 1 1 \n", + "439 1.0 31.0 0 0 10.5000 1 0 1 \n", + "840 1.0 20.0 0 0 7.9250 0 1 1 \n", + "720 1.0 6.0 0 1 33.0000 1 0 0 \n", + "39 1.0 14.0 1 0 11.2417 0 1 0 \n", + "\n", + " Embarked_Q Embarked_S \n", + "709 0 0 \n", + "439 0 1 \n", + "840 0 1 \n", + "720 0 1 \n", + "39 0 0 " + ] + }, + "execution_count": 1244, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "X_test.head()" + ] + }, + { + "cell_type": "code", + "execution_count": 1245, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "0.8067796610169492" + ] + }, + "execution_count": 1245, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "model.score(X_test, y_test)" + ] + }, + { + "cell_type": "code", + "execution_count": 1246, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "Index(['const', 'Age', 'SibSp', 'Parch', 'Fare', 'Pclass_2', 'Pclass_3',\n", + " 'Sex_male', 'Embarked_Q', 'Embarked_S'],\n", + " dtype='object')" + ] + }, + "execution_count": 1246, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# scatter_matrix(X);\n", + "X.columns" + ] + }, + { + "cell_type": "code", + "execution_count": 1247, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
constAgeSibSpParchFareSex_maleEmbarked_QEmbarked_S
01.022.0107.2500101
11.038.01071.2833000
21.026.0007.9250001
31.035.01053.1000001
41.035.0008.0500101
\n", + "
" + ], + "text/plain": [ + " const Age SibSp Parch Fare Sex_male Embarked_Q Embarked_S\n", + "0 1.0 22.0 1 0 7.2500 1 0 1\n", + "1 1.0 38.0 1 0 71.2833 0 0 0\n", + "2 1.0 26.0 0 0 7.9250 0 0 1\n", + "3 1.0 35.0 1 0 53.1000 0 0 1\n", + "4 1.0 35.0 0 0 8.0500 1 0 1" + ] + }, + "execution_count": 1247, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "X = X.drop(columns=['Pclass_2', 'Pclass_3'])\n", + "X.head()" + ] + }, + { + "cell_type": "code", + "execution_count": 1248, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
constAgeSibSpParchFareSex_maleEmbarked_QEmbarked_S
01.022.0107.2500101
11.038.01071.2833000
21.026.0007.9250001
31.035.01053.1000001
41.035.0008.0500101
\n", + "
" + ], + "text/plain": [ + " const Age SibSp Parch Fare Sex_male Embarked_Q Embarked_S\n", + "0 1.0 22.0 1 0 7.2500 1 0 1\n", + "1 1.0 38.0 1 0 71.2833 0 0 0\n", + "2 1.0 26.0 0 0 7.9250 0 0 1\n", + "3 1.0 35.0 1 0 53.1000 0 0 1\n", + "4 1.0 35.0 0 0 8.0500 1 0 1" + ] + }, + "execution_count": 1248, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "X =pd.get_dummies(X)\n", + "# prob drop PassengerID and will need to drop one and dummy Sex, Embarked, Pclass\n", + "X = sm.add_constant(X)\n", + "X.head()" + ] + }, + { + "cell_type": "code", + "execution_count": 1249, + "metadata": {}, + "outputs": [], + "source": [ + "X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)" + ] + }, + { + "cell_type": "code", + "execution_count": 1250, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "\n", + "\n", + "\n", + " \n", + "\n", + "\n", + " \n", + "\n", + "\n", + " \n", + "\n", + "\n", + " \n", + "\n", + "\n", + " \n", + "\n", + "\n", + " \n", + "\n", + "\n", + " \n", + "\n", + "\n", + " \n", + "\n", + "\n", + " \n", + "\n", + "
OLS Regression Results
Dep. Variable: Survived R-squared: 0.334
Model: OLS Adj. R-squared: 0.326
Method: Least Squares F-statistic: 42.10
Date: Sun, 22 Jul 2018 Prob (F-statistic): 3.94e-48
Time: 15:14:03 Log-Likelihood: -291.45
No. Observations: 596 AIC: 598.9
Df Residuals: 588 BIC: 634.0
Df Model: 7
Covariance Type: nonrobust
\n", + "\n", + "\n", + " \n", + "\n", + "\n", + " \n", + "\n", + "\n", + " \n", + "\n", + "\n", + " \n", + "\n", + "\n", + " \n", + "\n", + "\n", + " \n", + "\n", + "\n", + " \n", + "\n", + "\n", + " \n", + "\n", + "\n", + " \n", + "\n", + "
coef std err t P>|t| [0.025 0.975]
const 0.8716 0.061 14.316 0.000 0.752 0.991
Age -0.0020 0.001 -1.494 0.136 -0.005 0.001
SibSp -0.0420 0.015 -2.754 0.006 -0.072 -0.012
Parch -0.0331 0.024 -1.390 0.165 -0.080 0.014
Fare 0.0015 0.000 4.405 0.000 0.001 0.002
Sex_male -0.5209 0.036 -14.432 0.000 -0.592 -0.450
Embarked_Q -0.1391 0.069 -2.025 0.043 -0.274 -0.004
Embarked_S -0.1363 0.045 -3.050 0.002 -0.224 -0.049
\n", + "\n", + "\n", + " \n", + "\n", + "\n", + " \n", + "\n", + "\n", + " \n", + "\n", + "\n", + " \n", + "\n", + "
Omnibus: 25.798 Durbin-Watson: 2.007
Prob(Omnibus): 0.000 Jarque-Bera (JB): 28.434
Skew: 0.535 Prob(JB): 6.69e-07
Kurtosis: 2.966 Cond. No. 328.


Warnings:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified." + ], + "text/plain": [ + "\n", + "\"\"\"\n", + " OLS Regression Results \n", + "==============================================================================\n", + "Dep. Variable: Survived R-squared: 0.334\n", + "Model: OLS Adj. R-squared: 0.326\n", + "Method: Least Squares F-statistic: 42.10\n", + "Date: Sun, 22 Jul 2018 Prob (F-statistic): 3.94e-48\n", + "Time: 15:14:03 Log-Likelihood: -291.45\n", + "No. Observations: 596 AIC: 598.9\n", + "Df Residuals: 588 BIC: 634.0\n", + "Df Model: 7 \n", + "Covariance Type: nonrobust \n", + "==============================================================================\n", + " coef std err t P>|t| [0.025 0.975]\n", + "------------------------------------------------------------------------------\n", + "const 0.8716 0.061 14.316 0.000 0.752 0.991\n", + "Age -0.0020 0.001 -1.494 0.136 -0.005 0.001\n", + "SibSp -0.0420 0.015 -2.754 0.006 -0.072 -0.012\n", + "Parch -0.0331 0.024 -1.390 0.165 -0.080 0.014\n", + "Fare 0.0015 0.000 4.405 0.000 0.001 0.002\n", + "Sex_male -0.5209 0.036 -14.432 0.000 -0.592 -0.450\n", + "Embarked_Q -0.1391 0.069 -2.025 0.043 -0.274 -0.004\n", + "Embarked_S -0.1363 0.045 -3.050 0.002 -0.224 -0.049\n", + "==============================================================================\n", + "Omnibus: 25.798 Durbin-Watson: 2.007\n", + "Prob(Omnibus): 0.000 Jarque-Bera (JB): 28.434\n", + "Skew: 0.535 Prob(JB): 6.69e-07\n", + "Kurtosis: 2.966 Cond. No. 328.\n", + "==============================================================================\n", + "\n", + "Warnings:\n", + "[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.\n", + "\"\"\"" + ] + }, + "execution_count": 1250, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "model = sm.OLS(y_train, X_train)\n", + "results = model.fit()\n", + "results.summary()" + ] + }, + { + "cell_type": "code", + "execution_count": 1251, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,\n", + " intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,\n", + " penalty='l2', random_state=None, solver='liblinear', tol=0.0001,\n", + " verbose=0, warm_start=False)" + ] + }, + "execution_count": 1251, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "model = LogisticRegression()\n", + "model.fit(X_train, y_train)" + ] + }, + { + "cell_type": "code", + "execution_count": 1252, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "array([0, 0, 0, 1, 1, 1, 1, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0,\n", + " 1, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0,\n", + " 1, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 1, 0, 1, 1, 1, 0, 1, 1, 0, 0, 1,\n", + " 0, 0, 0, 1, 1, 1, 0, 1, 0, 0, 1, 1, 1, 1, 0, 1, 1, 0, 0, 0, 1, 1,\n", + " 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0,\n", + " 1, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 1, 1, 0, 0, 0, 1, 1, 1, 0, 1, 0,\n", + " 0, 1, 0, 1, 1, 0, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 1,\n", + " 0, 0, 0, 1, 1, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 1, 0, 0,\n", + " 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 1, 1, 0,\n", + " 1, 0, 0, 1, 1, 1, 0, 1, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 1, 0, 1, 0,\n", + " 0, 1, 0, 0, 0, 1, 0, 0, 1, 1, 0, 1, 0, 1, 0, 1, 1, 1, 1, 0, 0, 1,\n", + " 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 1, 1, 0, 1, 0,\n", + " 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0,\n", + " 1, 0, 0, 0, 0, 0, 1, 1, 0], dtype=int64)" + ] + }, + "execution_count": 1252, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "model.predict(X_test)" + ] + }, + { + "cell_type": "code", + "execution_count": 1253, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "0.8033898305084746" + ] + }, + "execution_count": 1253, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "model.score(X_test, y_test)" + ] + }, + { + "cell_type": "code", + "execution_count": 1254, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Optimization terminated successfully.\n", + " Current function value: 0.478634\n", + " Iterations 6\n" + ] + }, + { + "data": { + "text/plain": [ + "const 8.193931e-07\n", + "Age 1.201037e-01\n", + "SibSp 4.293742e-03\n", + "Parch 1.026211e-01\n", + "Fare 1.949962e-04\n", + "Sex_male 1.307261e-28\n", + "Embarked_Q 6.099504e-02\n", + "Embarked_S 5.288578e-03\n", + "dtype: float64" + ] + }, + "execution_count": 1254, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "logitModel = sm.Logit(y_train, X_train)\n", + "logitModel_fit = logitModel.fit()\n", + "logitModel_fit.pvalues" + ] + }, + { + "cell_type": "code", + "execution_count": 1255, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "Index(['const', 'Age', 'SibSp', 'Parch', 'Fare', 'Sex_male', 'Embarked_Q',\n", + " 'Embarked_S'],\n", + " dtype='object')" + ] + }, + "execution_count": 1255, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "X.columns" + ] + }, + { + "cell_type": "code", + "execution_count": 1256, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
constSibSpParchSex_male
01.0101
11.0100
21.0000
31.0100
41.0001
\n", + "
" + ], + "text/plain": [ + " const SibSp Parch Sex_male\n", + "0 1.0 1 0 1\n", + "1 1.0 1 0 0\n", + "2 1.0 0 0 0\n", + "3 1.0 1 0 0\n", + "4 1.0 0 0 1" + ] + }, + "execution_count": 1256, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "X = X.drop(columns=['Age', 'Fare', 'Embarked_Q', 'Embarked_S'])\n", + "X.head()" + ] + }, + { + "cell_type": "code", + "execution_count": 1257, + "metadata": {}, + "outputs": [], + "source": [ + "X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)" + ] + }, + { + "cell_type": "code", + "execution_count": 1258, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "\n", + "\n", + "\n", + " \n", + "\n", + "\n", + " \n", + "\n", + "\n", + " \n", + "\n", + "\n", + " \n", + "\n", + "\n", + " \n", + "\n", + "\n", + " \n", + "\n", + "\n", + " \n", + "\n", + "\n", + " \n", + "\n", + "\n", + " \n", + "\n", + "
OLS Regression Results
Dep. Variable: Survived R-squared: 0.290
Model: OLS Adj. R-squared: 0.286
Method: Least Squares F-statistic: 80.57
Date: Sun, 22 Jul 2018 Prob (F-statistic): 1.02e-43
Time: 15:14:03 Log-Likelihood: -310.50
No. Observations: 596 AIC: 629.0
Df Residuals: 592 BIC: 646.6
Df Model: 3
Covariance Type: nonrobust
\n", + "\n", + "\n", + " \n", + "\n", + "\n", + " \n", + "\n", + "\n", + " \n", + "\n", + "\n", + " \n", + "\n", + "\n", + " \n", + "\n", + "
coef std err t P>|t| [0.025 0.975]
const 0.7630 0.032 23.732 0.000 0.700 0.826
SibSp -0.0357 0.015 -2.342 0.020 -0.066 -0.006
Parch -0.0200 0.024 -0.836 0.404 -0.067 0.027
Sex_male -0.5538 0.036 -15.174 0.000 -0.625 -0.482
\n", + "\n", + "\n", + " \n", + "\n", + "\n", + " \n", + "\n", + "\n", + " \n", + "\n", + "\n", + " \n", + "\n", + "
Omnibus: 22.278 Durbin-Watson: 2.020
Prob(Omnibus): 0.000 Jarque-Bera (JB): 24.206
Skew: 0.491 Prob(JB): 5.54e-06
Kurtosis: 2.904 Cond. No. 4.58


Warnings:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified." + ], + "text/plain": [ + "\n", + "\"\"\"\n", + " OLS Regression Results \n", + "==============================================================================\n", + "Dep. Variable: Survived R-squared: 0.290\n", + "Model: OLS Adj. R-squared: 0.286\n", + "Method: Least Squares F-statistic: 80.57\n", + "Date: Sun, 22 Jul 2018 Prob (F-statistic): 1.02e-43\n", + "Time: 15:14:03 Log-Likelihood: -310.50\n", + "No. Observations: 596 AIC: 629.0\n", + "Df Residuals: 592 BIC: 646.6\n", + "Df Model: 3 \n", + "Covariance Type: nonrobust \n", + "==============================================================================\n", + " coef std err t P>|t| [0.025 0.975]\n", + "------------------------------------------------------------------------------\n", + "const 0.7630 0.032 23.732 0.000 0.700 0.826\n", + "SibSp -0.0357 0.015 -2.342 0.020 -0.066 -0.006\n", + "Parch -0.0200 0.024 -0.836 0.404 -0.067 0.027\n", + "Sex_male -0.5538 0.036 -15.174 0.000 -0.625 -0.482\n", + "==============================================================================\n", + "Omnibus: 22.278 Durbin-Watson: 2.020\n", + "Prob(Omnibus): 0.000 Jarque-Bera (JB): 24.206\n", + "Skew: 0.491 Prob(JB): 5.54e-06\n", + "Kurtosis: 2.904 Cond. No. 4.58\n", + "==============================================================================\n", + "\n", + "Warnings:\n", + "[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.\n", + "\"\"\"" + ] + }, + "execution_count": 1258, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "model = sm.OLS(y_train, X_train)\n", + "results = model.fit()\n", + "results.summary()" + ] + }, + { + "cell_type": "code", + "execution_count": 1259, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,\n", + " intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,\n", + " penalty='l2', random_state=None, solver='liblinear', tol=0.0001,\n", + " verbose=0, warm_start=False)" + ] + }, + "execution_count": 1259, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "model = LogisticRegression()\n", + "model.fit(X_train, y_train)" + ] + }, + { + "cell_type": "code", + "execution_count": 1260, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "array([0, 0, 0, 1, 1, 1, 1, 0, 1, 1, 0, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 0,\n", + " 1, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0,\n", + " 1, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 1, 0, 1, 1, 1, 0, 1, 1, 0, 0, 1,\n", + " 0, 0, 0, 1, 1, 1, 0, 1, 0, 0, 1, 1, 1, 1, 0, 1, 1, 0, 0, 0, 1, 1,\n", + " 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0,\n", + " 1, 0, 1, 0, 0, 0, 0, 0, 1, 1, 0, 1, 1, 0, 0, 0, 1, 0, 1, 0, 1, 0,\n", + " 0, 1, 0, 1, 1, 0, 0, 1, 0, 1, 0, 0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 1,\n", + " 0, 0, 0, 1, 1, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 1, 0, 0,\n", + " 1, 1, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 1, 1, 0,\n", + " 1, 1, 0, 1, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 1, 0,\n", + " 0, 1, 0, 0, 0, 1, 0, 0, 1, 1, 0, 1, 0, 1, 0, 1, 1, 1, 1, 0, 0, 1,\n", + " 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 1, 1, 0, 1, 0,\n", + " 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0,\n", + " 1, 0, 0, 0, 0, 0, 1, 1, 0], dtype=int64)" + ] + }, + "execution_count": 1260, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "model.predict(X_test)" + ] + }, + { + "cell_type": "code", + "execution_count": 1261, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "0.7966101694915254" + ] + }, + "execution_count": 1261, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "model.score(X_test, y_test)" + ] + }, + { + "cell_type": "code", + "execution_count": 1262, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
PclassSexSibSp
03male1
11female1
23female0
31female1
43male0
\n", + "
" + ], + "text/plain": [ + " Pclass Sex SibSp\n", + "0 3 male 1\n", + "1 1 female 1\n", + "2 3 female 0\n", + "3 1 female 1\n", + "4 3 male 0" + ] + }, + "execution_count": 1262, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "X = df.drop(columns=['Age', 'Fare', 'Embarked', 'Survived', 'Parch'])\n", + "X.head()" + ] + }, + { + "cell_type": "code", + "execution_count": 1263, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
constSibSpPclass_1Pclass_2Pclass_3Sex_femaleSex_male
01.0100101
11.0110010
21.0000110
31.0110010
41.0000101
\n", + "
" + ], + "text/plain": [ + " const SibSp Pclass_1 Pclass_2 Pclass_3 Sex_female Sex_male\n", + "0 1.0 1 0 0 1 0 1\n", + "1 1.0 1 1 0 0 1 0\n", + "2 1.0 0 0 0 1 1 0\n", + "3 1.0 1 1 0 0 1 0\n", + "4 1.0 0 0 0 1 0 1" + ] + }, + "execution_count": 1263, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "X =pd.get_dummies(X)\n", + "X = sm.add_constant(X)\n", + "X.head()" + ] + }, + { + "cell_type": "code", + "execution_count": 1264, + "metadata": {}, + "outputs": [], + "source": [ + "X = X.drop(columns = ['Pclass_3', 'Sex_male'])" + ] + }, + { + "cell_type": "code", + "execution_count": 1265, + "metadata": {}, + "outputs": [], + "source": [ + "X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)" + ] + }, + { + "cell_type": "code", + "execution_count": 1266, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "\n", + "\n", + "\n", + " \n", + "\n", + "\n", + " \n", + "\n", + "\n", + " \n", + "\n", + "\n", + " \n", + "\n", + "\n", + " \n", + "\n", + "\n", + " \n", + "\n", + "\n", + " \n", + "\n", + "\n", + " \n", + "\n", + "\n", + " \n", + "\n", + "
OLS Regression Results
Dep. Variable: Survived R-squared: 0.347
Model: OLS Adj. R-squared: 0.343
Method: Least Squares F-statistic: 78.63
Date: Sun, 22 Jul 2018 Prob (F-statistic): 1.81e-53
Time: 15:14:03 Log-Likelihood: -285.38
No. Observations: 596 AIC: 580.8
Df Residuals: 591 BIC: 602.7
Df Model: 4
Covariance Type: nonrobust
\n", + "\n", + "\n", + " \n", + "\n", + "\n", + " \n", + "\n", + "\n", + " \n", + "\n", + "\n", + " \n", + "\n", + "\n", + " \n", + "\n", + "\n", + " \n", + "\n", + "
coef std err t P>|t| [0.025 0.975]
const 0.1110 0.025 4.443 0.000 0.062 0.160
SibSp -0.0298 0.013 -2.250 0.025 -0.056 -0.004
Pclass_1 0.2707 0.040 6.689 0.000 0.191 0.350
Pclass_2 0.1928 0.041 4.658 0.000 0.112 0.274
Sex_female 0.5118 0.034 14.935 0.000 0.444 0.579
\n", + "\n", + "\n", + " \n", + "\n", + "\n", + " \n", + "\n", + "\n", + " \n", + "\n", + "\n", + " \n", + "\n", + "
Omnibus: 33.990 Durbin-Watson: 2.035
Prob(Omnibus): 0.000 Jarque-Bera (JB): 38.802
Skew: 0.625 Prob(JB): 3.75e-09
Kurtosis: 3.017 Cond. No. 4.59


Warnings:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified." + ], + "text/plain": [ + "\n", + "\"\"\"\n", + " OLS Regression Results \n", + "==============================================================================\n", + "Dep. Variable: Survived R-squared: 0.347\n", + "Model: OLS Adj. R-squared: 0.343\n", + "Method: Least Squares F-statistic: 78.63\n", + "Date: Sun, 22 Jul 2018 Prob (F-statistic): 1.81e-53\n", + "Time: 15:14:03 Log-Likelihood: -285.38\n", + "No. Observations: 596 AIC: 580.8\n", + "Df Residuals: 591 BIC: 602.7\n", + "Df Model: 4 \n", + "Covariance Type: nonrobust \n", + "==============================================================================\n", + " coef std err t P>|t| [0.025 0.975]\n", + "------------------------------------------------------------------------------\n", + "const 0.1110 0.025 4.443 0.000 0.062 0.160\n", + "SibSp -0.0298 0.013 -2.250 0.025 -0.056 -0.004\n", + "Pclass_1 0.2707 0.040 6.689 0.000 0.191 0.350\n", + "Pclass_2 0.1928 0.041 4.658 0.000 0.112 0.274\n", + "Sex_female 0.5118 0.034 14.935 0.000 0.444 0.579\n", + "==============================================================================\n", + "Omnibus: 33.990 Durbin-Watson: 2.035\n", + "Prob(Omnibus): 0.000 Jarque-Bera (JB): 38.802\n", + "Skew: 0.625 Prob(JB): 3.75e-09\n", + "Kurtosis: 3.017 Cond. No. 4.59\n", + "==============================================================================\n", + "\n", + "Warnings:\n", + "[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.\n", + "\"\"\"" + ] + }, + "execution_count": 1266, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "model = sm.OLS(y_train, X_train)\n", + "results = model.fit()\n", + "results.summary()" + ] + }, + { + "cell_type": "code", + "execution_count": 1267, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,\n", + " intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,\n", + " penalty='l2', random_state=None, solver='liblinear', tol=0.0001,\n", + " verbose=0, warm_start=False)" + ] + }, + "execution_count": 1267, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "model = LogisticRegression()\n", + "model.fit(X_train, y_train)" + ] + }, + { + "cell_type": "code", + "execution_count": 1268, + "metadata": {}, + "outputs": [], + "source": [ + "survival_predict = model.predict(X_test) #[:10]" + ] + }, + { + "cell_type": "code", + "execution_count": 1269, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "0.8135593220338984" + ] + }, + "execution_count": 1269, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "model.score(X_test, y_test)" + ] + }, + { + "cell_type": "code", + "execution_count": 1270, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "Index(['const', 'SibSp', 'Pclass_1', 'Pclass_2', 'Sex_female'], dtype='object')" + ] + }, + "execution_count": 1270, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "X.columns" + ] + }, + { + "cell_type": "code", + "execution_count": 1271, + "metadata": {}, + "outputs": [], + "source": [ + "# best combo of variables I've gotten so far based on model score is above, \n", + "# including only siblings/spouses, passenger class, and sex" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": 1272, + "metadata": {}, + "outputs": [], + "source": [ + "# wanted to try the age bins\n", + "# first, getting an idea of where to bin" + ] + }, + { + "cell_type": "code", + "execution_count": 1273, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
Survived01
Sexfemalemalefemalemale
Age
0.420001
0.920001
4.000352
8.001111
12.000001
15.000140
19.0001672
22.00214101
24.500100
28.00191244118
30.501100
33.000960
36.0001174
39.002741
42.000733
45.500200
49.000222
53.000010
56.000211
60.000211
64.000200
70.500100
\n", + "
" + ], + "text/plain": [ + "Survived 0 1 \n", + "Sex female male female male\n", + "Age \n", + "0.42 0 0 0 1\n", + "0.92 0 0 0 1\n", + "4.00 0 3 5 2\n", + "8.00 1 1 1 1\n", + "12.00 0 0 0 1\n", + "15.00 0 1 4 0\n", + "19.00 0 16 7 2\n", + "22.00 2 14 10 1\n", + "24.50 0 1 0 0\n", + "28.00 19 124 41 18\n", + "30.50 1 1 0 0\n", + "33.00 0 9 6 0\n", + "36.00 0 11 7 4\n", + "39.00 2 7 4 1\n", + "42.00 0 7 3 3\n", + "45.50 0 2 0 0\n", + "49.00 0 2 2 2\n", + "53.00 0 0 1 0\n", + "56.00 0 2 1 1\n", + "60.00 0 2 1 1\n", + "64.00 0 2 0 0\n", + "70.50 0 1 0 0" + ] + }, + "execution_count": 1273, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df.pivot_table(index=['Age'], columns=('Survived','Sex'), aggfunc='size', fill_value=0)[::4]" + ] + }, + { + "cell_type": "code", + "execution_count": 1274, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
constSibSpPclass_1Pclass_2Sex_female
01.01000
11.01101
21.00001
31.01101
41.00000
\n", + "
" + ], + "text/plain": [ + " const SibSp Pclass_1 Pclass_2 Sex_female\n", + "0 1.0 1 0 0 0\n", + "1 1.0 1 1 0 1\n", + "2 1.0 0 0 0 1\n", + "3 1.0 1 1 0 1\n", + "4 1.0 0 0 0 0" + ] + }, + "execution_count": 1274, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "X =pd.get_dummies(X)\n", + "X = sm.add_constant(X)\n", + "X.head()" + ] + }, + { + "cell_type": "code", + "execution_count": 1279, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
constSibSpPclass_1Pclass_2Sex_femaleage_binage_bin
01.01000NaN2
11.01101NaN2
21.00001NaN2
31.01101NaN2
41.00000NaN2
\n", + "
" + ], + "text/plain": [ + " const SibSp Pclass_1 Pclass_2 Sex_female age_bin age_bin\n", + "0 1.0 1 0 0 0 NaN 2\n", + "1 1.0 1 1 0 1 NaN 2\n", + "2 1.0 0 0 0 1 NaN 2\n", + "3 1.0 1 1 0 1 NaN 2\n", + "4 1.0 0 0 0 0 NaN 2" + ] + }, + "execution_count": 1279, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# variablizing age bins for testing\n", + "a = 0\n", + "b = 2\n", + "c = 82\n", + "# d = 82\n", + "age_bins = [a,b,c]\n", + "# age_bins = [0,1,16,82]\n", + "age_series = pd.cut(df['Age'], bins=age_bins, labels=age_bins[:-1])\n", + "age_series.name = 'age_bin'\n", + "X = pd.concat([X, age_series], axis=1)\n", + "X.head()" + ] + }, + { + "cell_type": "code", + "execution_count": 1276, + "metadata": {}, + "outputs": [], + "source": [ + "X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)" + ] + }, + { + "cell_type": "code", + "execution_count": 1277, + "metadata": {}, + "outputs": [], + "source": [ + "# model = sm.OLS(y_train, X_train)\n", + "# results = model.fit()\n", + "# results.summary()" + ] + }, + { + "cell_type": "code", + "execution_count": 1278, + "metadata": {}, + "outputs": [ + { + "ename": "ValueError", + "evalue": "Input contains NaN, infinity or a value too large for dtype('float64').", + "output_type": "error", + "traceback": [ + "\u001b[1;31m---------------------------------------------------------------------------\u001b[0m", + "\u001b[1;31mValueError\u001b[0m Traceback (most recent call last)", + "\u001b[1;32m\u001b[0m in \u001b[0;36m\u001b[1;34m()\u001b[0m\n\u001b[0;32m 1\u001b[0m \u001b[0mmodel\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mLogisticRegression\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m----> 2\u001b[1;33m \u001b[0mmodel\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mfit\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mX_train\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0my_train\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m", + "\u001b[1;32mC:\\ProgramData\\Anaconda3\\lib\\site-packages\\sklearn\\linear_model\\logistic.py\u001b[0m in \u001b[0;36mfit\u001b[1;34m(self, X, y, sample_weight)\u001b[0m\n\u001b[0;32m 1214\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 1215\u001b[0m X, y = check_X_y(X, y, accept_sparse='csr', dtype=_dtype,\n\u001b[1;32m-> 1216\u001b[1;33m order=\"C\")\n\u001b[0m\u001b[0;32m 1217\u001b[0m \u001b[0mcheck_classification_targets\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0my\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 1218\u001b[0m \u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mclasses_\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mnp\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0munique\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0my\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n", + "\u001b[1;32mC:\\ProgramData\\Anaconda3\\lib\\site-packages\\sklearn\\utils\\validation.py\u001b[0m in \u001b[0;36mcheck_X_y\u001b[1;34m(X, y, accept_sparse, dtype, order, copy, force_all_finite, ensure_2d, allow_nd, multi_output, ensure_min_samples, ensure_min_features, y_numeric, warn_on_dtype, estimator)\u001b[0m\n\u001b[0;32m 571\u001b[0m X = check_array(X, accept_sparse, dtype, order, copy, force_all_finite,\n\u001b[0;32m 572\u001b[0m \u001b[0mensure_2d\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mallow_nd\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mensure_min_samples\u001b[0m\u001b[1;33m,\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m--> 573\u001b[1;33m ensure_min_features, warn_on_dtype, estimator)\n\u001b[0m\u001b[0;32m 574\u001b[0m \u001b[1;32mif\u001b[0m \u001b[0mmulti_output\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 575\u001b[0m y = check_array(y, 'csr', force_all_finite=True, ensure_2d=False,\n", + "\u001b[1;32mC:\\ProgramData\\Anaconda3\\lib\\site-packages\\sklearn\\utils\\validation.py\u001b[0m in \u001b[0;36mcheck_array\u001b[1;34m(array, accept_sparse, dtype, order, copy, force_all_finite, ensure_2d, allow_nd, ensure_min_samples, ensure_min_features, warn_on_dtype, estimator)\u001b[0m\n\u001b[0;32m 451\u001b[0m % (array.ndim, estimator_name))\n\u001b[0;32m 452\u001b[0m \u001b[1;32mif\u001b[0m \u001b[0mforce_all_finite\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m--> 453\u001b[1;33m \u001b[0m_assert_all_finite\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0marray\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m 454\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 455\u001b[0m \u001b[0mshape_repr\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0m_shape_repr\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0marray\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mshape\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n", + "\u001b[1;32mC:\\ProgramData\\Anaconda3\\lib\\site-packages\\sklearn\\utils\\validation.py\u001b[0m in \u001b[0;36m_assert_all_finite\u001b[1;34m(X)\u001b[0m\n\u001b[0;32m 42\u001b[0m and not np.isfinite(X).all()):\n\u001b[0;32m 43\u001b[0m raise ValueError(\"Input contains NaN, infinity\"\n\u001b[1;32m---> 44\u001b[1;33m \" or a value too large for %r.\" % X.dtype)\n\u001b[0m\u001b[0;32m 45\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 46\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n", + "\u001b[1;31mValueError\u001b[0m: Input contains NaN, infinity or a value too large for dtype('float64')." + ] + } + ], + "source": [ + "model = LogisticRegression()\n", + "model.fit(X_train, y_train)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "survival_predict = model.predict(X_test)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "model.score(X_test, y_test)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# best score only slightly better with bins for ages of 0, 1, 16, and 82" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.6.5" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +}