From 83f124c515894267045e0308d8f40d6b059e8eff Mon Sep 17 00:00:00 2001 From: Renee Hlozek Date: Mon, 22 Apr 2019 14:33:24 -0400 Subject: [PATCH 1/9] updated science prize confusion matrix re-ordering, still issues with the labelling --- metrics_evaluation.ipynb | 185 +++++++++++++++++-------------------- paper/kaggle-run.ipynb | 26 ++++-- paper/main_paperfigs.ipynb | 37 ++++++-- 3 files changed, 131 insertions(+), 117 deletions(-) diff --git a/metrics_evaluation.ipynb b/metrics_evaluation.ipynb index 7b8163f..98ccc76 100644 --- a/metrics_evaluation.ipynb +++ b/metrics_evaluation.ipynb @@ -16,8 +16,10 @@ }, { "cell_type": "code", - "execution_count": null, - "metadata": {}, + "execution_count": 1, + "metadata": { + "scrolled": true + }, "outputs": [], "source": [ "# import string\n", @@ -35,8 +37,10 @@ }, { "cell_type": "code", - "execution_count": null, - "metadata": {}, + "execution_count": 2, + "metadata": { + "scrolled": true + }, "outputs": [], "source": [ "import matplotlib as mpl\n", @@ -63,8 +67,10 @@ }, { "cell_type": "code", - "execution_count": null, - "metadata": {}, + "execution_count": 3, + "metadata": { + "scrolled": true + }, "outputs": [], "source": [ "metricslist = ['Brier', 'LogLoss']\n", @@ -89,8 +95,10 @@ }, { "cell_type": "code", - "execution_count": null, - "metadata": {}, + "execution_count": 4, + "metadata": { + "scrolled": true + }, "outputs": [], "source": [ "mystery = {}\n", @@ -107,8 +115,10 @@ }, { "cell_type": "code", - "execution_count": null, - "metadata": {}, + "execution_count": 5, + "metadata": { + "scrolled": true + }, "outputs": [], "source": [ "snphotcc = {}\n", @@ -130,8 +140,10 @@ }, { "cell_type": "code", - "execution_count": null, - "metadata": {}, + "execution_count": 6, + "metadata": { + "scrolled": true + }, "outputs": [], "source": [ "plasticc = {}\n", @@ -140,38 +152,16 @@ ] }, { - "cell_type": "code", - "execution_count": null, + "cell_type": "markdown", "metadata": {}, - "outputs": [], - "source": [ - "# old_snphotcc_names = []\n", - "# for prefix in ['templates_', 'wavelets_']:\n", - "# for suffix in ['boost_forest', 'knn', 'nb', 'neural_network', 'svm']:\n", - "# old_snphotcc_names.append(prefix+suffix+'.dat')\n", - "\n", - "# for i in range(len(snphotcc_names)):\n", - "# name = old_snphotcc_names[i]\n", - "# fileloc = dirname+'classifications/'+name\n", - "# snphotcc_info = pd.read_csv(fileloc, sep=' ')\n", - "# full = snphotcc_info.set_index('Object').join(truth_snphotcc.set_index('Object'))\n", - "# name = snphotcc_names[i]\n", - " \n", - "# truth = full['Type'] - 1\n", - "# snphotcc_truth_table = proclam.metrics.util.det_to_prob(truth)\n", - "# fileloc = 'examples/'+name+'/truth_table_'+name+'.csv'\n", - "# with open(fileloc, 'wb') as truth_place:\n", - "# np.savetxt(fileloc, snphotcc_truth_table, delimiter=' ')\n", - " \n", - "# probs = full[['1', '2', '3']]\n", - "# fileloc = 'examples/'+name+'/predicted_prob_'+name+'.csv'\n", - "# probs.to_csv(fileloc, sep=' ', index=False, header=True)" - ] + "source": [] }, { "cell_type": "code", - "execution_count": null, - "metadata": {}, + "execution_count": 7, + "metadata": { + "scrolled": true + }, "outputs": [], "source": [ "# more_names = snphotcc_names\n", @@ -182,8 +172,10 @@ }, { "cell_type": "code", - "execution_count": null, - "metadata": {}, + "execution_count": 8, + "metadata": { + "scrolled": true + }, "outputs": [], "source": [ "def make_class_pairs(data_info_dict):\n", @@ -200,11 +192,21 @@ }, { "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], + 
"execution_count": 9, + "metadata": { + "scrolled": true + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "{'label': 'ProClaM', 'names': ['Idealized', 'Guess', 'Tunnel', 'Broadbrush', 'Cruise', 'SubsumedTo', 'SubsumedFrom'], 'dirname': 'examples/ProClaM/', 'classifications': ['Idealized/predicted_prob_Idealized.csv', 'Guess/predicted_prob_Guess.csv', 'Tunnel/predicted_prob_Tunnel.csv', 'Broadbrush/predicted_prob_Broadbrush.csv', 'Cruise/predicted_prob_Cruise.csv', 'SubsumedTo/predicted_prob_SubsumedTo.csv', 'SubsumedFrom/predicted_prob_SubsumedFrom.csv'], 'truth_tables': ['Idealized/truth_table_Idealized.csv', 'Guess/truth_table_Guess.csv', 'Tunnel/truth_table_Tunnel.csv', 'Broadbrush/truth_table_Broadbrush.csv', 'Cruise/truth_table_Cruise.csv', 'SubsumedTo/truth_table_SubsumedTo.csv', 'SubsumedFrom/truth_table_SubsumedFrom.csv']}\n" + ] + } + ], "source": [ - "for dataset in [mystery, snphotcc, plasticc]:\n", + "for dataset in [ plasticc]: #mystery, snphotcc,\n", " dataset = make_file_locs(dataset)\n", " dataset['class_pairs'] = make_class_pairs(dataset)" ] @@ -220,8 +222,10 @@ }, { "cell_type": "code", - "execution_count": null, - "metadata": {}, + "execution_count": 10, + "metadata": { + "scrolled": true + }, "outputs": [], "source": [ "def plot_cm(probs, truth, name, loc=''):\n", @@ -233,14 +237,17 @@ " plt.ylabel('true class')\n", " plt.colorbar()\n", " plt.title(name)\n", - " plt.savefig(loc+name+'_cm.png')\n", - " plt.close()" + " #plt.savefig(loc+name+'_cm.png')\n", + " plt.show()\n", + " #plt.close()" ] }, { "cell_type": "code", - "execution_count": null, - "metadata": {}, + "execution_count": 11, + "metadata": { + "scrolled": true + }, "outputs": [], "source": [ "def read_class_pairs(pair, dataset, cc):#loc='', title=''):\n", @@ -255,6 +262,7 @@ " nobj_truth = np.shape(truth_values)[0]\n", " nclass_truth = np.shape(truth_values)[1]\n", " tvec = np.where(truth_values==1)[1]\n", + " print(tvec)\n", "# if nclass_truth!= nclass:\n", "# print('Truth table of size %i x %i and prob matrix of size %i x %i do not match up in size'%(nobj,nclass,nobj_truth,nclass_truth))\n", "# else:\n", @@ -266,8 +274,10 @@ }, { "cell_type": "code", - "execution_count": null, - "metadata": {}, + "execution_count": 12, + "metadata": { + "scrolled": true + }, "outputs": [], "source": [ "def make_patch_spines_invisible(ax):\n", @@ -316,6 +326,7 @@ " plt.legend(handles, metric_names)\n", " plt.suptitle(title)\n", " plt.savefig(fileloc)\n", + " plt.show()\n", " return" ] }, @@ -328,13 +339,28 @@ }, { "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], + "execution_count": 13, + "metadata": { + "scrolled": false + }, + "outputs": [ + { + "ename": "KeyError", + "evalue": "'class_pairs'", + "output_type": "error", + "traceback": [ + "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", + "\u001b[0;31mKeyError\u001b[0m Traceback (most recent call last)", + "\u001b[0;32m\u001b[0m in \u001b[0;36m\u001b[0;34m\u001b[0m\n\u001b[1;32m 1\u001b[0m \u001b[0;32mfor\u001b[0m \u001b[0mdataset\u001b[0m \u001b[0;32min\u001b[0m \u001b[0;34m[\u001b[0m\u001b[0mmystery\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0msnphotcc\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mplasticc\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 2\u001b[0m \u001b[0mdata\u001b[0m \u001b[0;34m=\u001b[0m 
\u001b[0mnp\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mempty\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mlen\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mmetricslist\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mlen\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mdataset\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;34m'names'\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m----> 3\u001b[0;31m \u001b[0;32mfor\u001b[0m \u001b[0mcc\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mpair\u001b[0m \u001b[0;32min\u001b[0m \u001b[0menumerate\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mdataset\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;34m'class_pairs'\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 4\u001b[0m \u001b[0mprint\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mpair\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 5\u001b[0m \u001b[0mprobm\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mtruthv\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mread_class_pairs\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mpair\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mdataset\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mcc\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;31m#loc=dataset['dirname'], title=dataset['label']+' '+dataset['names'][cc])\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", + "\u001b[0;31mKeyError\u001b[0m: 'class_pairs'" + ] + } + ], "source": [ "for dataset in [mystery, snphotcc, plasticc]:\n", " data = np.empty((len(metricslist), len(dataset['names'])))\n", " for cc, pair in enumerate(dataset['class_pairs']):\n", + " print(pair)\n", " probm, truthv = read_class_pairs(pair, dataset, cc)#loc=dataset['dirname'], title=dataset['label']+' '+dataset['names'][cc])\n", " for count, metric in enumerate(metricslist):\n", " D = getattr(proclam.metrics, metric)()\n", @@ -344,45 +370,6 @@ " metric_plot(dataset, metricslist, markerlist, colors)" ] }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# more_data = np.empty((len(metricslist), len(more_names)))\n", - "# for cc, pair in enumerate(more_class_pairs):\n", - "# probm, truthv = read_class_pairs(pair, dirname)\n", - "# for count, metric in enumerate(metricslist):\n", - "# D = getattr(proclam.metrics, metric)()\n", - "# hm = D.evaluate(probm, truthv)\n", - "# more_data[count][cc] = hm" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# metric_plot(more_names, metricslist, more_data, markerlist, colors, title='SNPhotCC', fileloc=dirname+'snphotccdata.png')" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# data = np.empty((len(metricslist), len(names)))\n", - "# for cc, pair in enumerate(class_pairs):\n", - "# probm, truthv = read_class_pairs(pair, dirname)\n", - "# for count, metric in enumerate(metricslist):\n", - "# D = getattr(proclam.metrics, metric)()\n", - "# hm = D.evaluate(probm, truthv)\n", - "# data[count][cc] = hm" - ] - }, { "cell_type": "code", "execution_count": null, @@ -390,9 +377,7 @@ "scrolled": true }, "outputs": [], - "source": [ - "# metric_plot(names, metricslist, data, markerlist, colors, title='Mystery Dataset', fileloc=dirname+'mysterydata.png')" - ] + "source": [] }, { "cell_type": "code", @@ -419,7 +404,7 @@ "name": 
"python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.6.5" + "version": "3.6.8" } }, "nbformat": 4, diff --git a/paper/kaggle-run.ipynb b/paper/kaggle-run.ipynb index 5f32d29..f8ce066 100644 --- a/paper/kaggle-run.ipynb +++ b/paper/kaggle-run.ipynb @@ -175,12 +175,12 @@ { "data": { "text/plain": [ - "{'KNeighbors': ['KNeighbors/predicted_prob_KNeighbors.csv',\n", + "{'RandomForest': ['RandomForest/predicted_prob_RandomForest.csv',\n", + " 'RandomForest/truth_table_RandomForest.csv'],\n", + " 'KNeighbors': ['KNeighbors/predicted_prob_KNeighbors.csv',\n", " 'KNeighbors/truth_table_KNeighbors.csv'],\n", " 'MLPNeuralNet': ['MLPNeuralNet/predicted_prob_MLPNeuralNet.csv',\n", - " 'MLPNeuralNet/truth_table_MLPNeuralNet.csv'],\n", - " 'RandomForest': ['RandomForest/predicted_prob_RandomForest.csv',\n", - " 'RandomForest/truth_table_RandomForest.csv']}" + " 'MLPNeuralNet/truth_table_MLPNeuralNet.csv']}" ] }, "execution_count": 5, @@ -212,7 +212,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 6, "metadata": {}, "outputs": [], "source": [ @@ -622,9 +622,19 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 7, "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "KNeighbors with weights [0.05882353 0.11764706 0.05882353 0.05882353 0.05882353 0.05882353\n", + " 0.05882353 0.05882353 0.05882353 0.05882353 0.11764706 0.11764706\n", + " 0.11764706] has LogLoss = 20.749255306361132\n" + ] + } + ], "source": [ "# This is how you run the metric with a random weight vector.\n", "\n", @@ -703,7 +713,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.6.4" + "version": "3.6.8" } }, "nbformat": 4, diff --git a/paper/main_paperfigs.ipynb b/paper/main_paperfigs.ipynb index 476ae20..75f2287 100644 --- a/paper/main_paperfigs.ipynb +++ b/paper/main_paperfigs.ipynb @@ -2,7 +2,7 @@ "cells": [ { "cell_type": "code", - "execution_count": null, + "execution_count": 1, "metadata": { "tags": [ "hideme" @@ -18,7 +18,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 2, "metadata": { "tags": [ "hideme" @@ -51,7 +51,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 3, "metadata": {}, "outputs": [], "source": [ @@ -121,14 +121,33 @@ { "cell_type": "code", "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": 5, "metadata": { "tags": [ "hideme" ] }, - "outputs": [], - "source": [ - "import proclam\n", + "outputs": [ + { + "ename": "ModuleNotFoundError", + "evalue": "No module named 'proclam'", + "output_type": "error", + "traceback": [ + "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", + "\u001b[0;31mModuleNotFoundError\u001b[0m Traceback (most recent call last)", + "\u001b[0;32m\u001b[0m in \u001b[0;36m\u001b[0;34m\u001b[0m\n\u001b[1;32m 1\u001b[0m \u001b[0;31m#import proclam\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m----> 2\u001b[0;31m \u001b[0;32mfrom\u001b[0m \u001b[0mproclam\u001b[0m \u001b[0;32mimport\u001b[0m \u001b[0;34m*\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m", + "\u001b[0;31mModuleNotFoundError\u001b[0m: No module named 'proclam'" + ] + } + ], + "source": [ + "#import proclam\n", "from proclam import *" ] }, @@ -1913,9 +1932,9 @@ "anaconda-cloud": {}, 
"celltoolbar": "Tags", "kernelspec": { - "display_name": "ProClaM (Python 3)", + "display_name": "Python 3", "language": "python", - "name": "proclam_3" + "name": "python3" }, "language_info": { "codemirror_mode": { @@ -1927,7 +1946,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.6.5" + "version": "3.6.8" } }, "nbformat": 4, From 308f1975fe95f68d3fcf7761d83559e40d42698b Mon Sep 17 00:00:00 2001 From: Renee Hlozek Date: Tue, 23 Apr 2019 16:50:39 -0400 Subject: [PATCH 2/9] indentation util --- proclam/metrics/util.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/proclam/metrics/util.py b/proclam/metrics/util.py index db2e5e9..60cc054 100644 --- a/proclam/metrics/util.py +++ b/proclam/metrics/util.py @@ -398,6 +398,6 @@ def auc(x, y): y = np.concatenate(([0.], y, [1.]),) i = np.argsort(x) - auc = trapz(y[i], x[i]) + auc = trapz(y[i], x[i]) return auc From 7c18574eccf1870e7e70f0d2d5879a6863026435 Mon Sep 17 00:00:00 2001 From: Renee Hlozek Date: Tue, 23 Apr 2019 16:53:44 -0400 Subject: [PATCH 3/9] indendation --- proclam/metrics/util.py | 28 ++++++++++++++-------------- 1 file changed, 14 insertions(+), 14 deletions(-) diff --git a/proclam/metrics/util.py b/proclam/metrics/util.py index 60cc054..9ed65e5 100644 --- a/proclam/metrics/util.py +++ b/proclam/metrics/util.py @@ -381,23 +381,23 @@ def recall(classifications,truth,class_idx): def auc(x, y): """ Computes the area under curve (just a wrapper for trapezoid rule) - - Parameters - ---------- - x: numpy.ndarray, int or float - - y: numpy.ndarray, int or float - - Returns - ------- - rates: named tuple, float + + Parameters + ---------- + x: numpy.ndarray, int or float + + y: numpy.ndarray, int or float + + Returns + ------- + rates: named tuple, float RateMatrix named tuple - """ - + """ + x = np.concatenate(([0.], x, [1.]),) y = np.concatenate(([0.], y, [1.]),) - + i = np.argsort(x) auc = trapz(y[i], x[i]) - + return auc From 86ed0df3e254f634cb3bb5f199caf7d515f6b04f Mon Sep 17 00:00:00 2001 From: Renee Hlozek Date: Tue, 23 Apr 2019 16:55:58 -0400 Subject: [PATCH 4/9] indentation util --- proclam/metrics/util.py | 40 ++++++++++++++++++++-------------------- 1 file changed, 20 insertions(+), 20 deletions(-) diff --git a/proclam/metrics/util.py b/proclam/metrics/util.py index 9ed65e5..d291dd8 100644 --- a/proclam/metrics/util.py +++ b/proclam/metrics/util.py @@ -379,25 +379,25 @@ def recall(classifications,truth,class_idx): return tp/(tp+fn) def auc(x, y): - """ - Computes the area under curve (just a wrapper for trapezoid rule) - - Parameters - ---------- - x: numpy.ndarray, int or float - - y: numpy.ndarray, int or float - - Returns - ------- - rates: named tuple, float - RateMatrix named tuple - """ - - x = np.concatenate(([0.], x, [1.]),) - y = np.concatenate(([0.], y, [1.]),) + """ + Computes the area under curve (just a wrapper for trapezoid rule) + + Parameters + ---------- + x: numpy.ndarray, int or float - i = np.argsort(x) - auc = trapz(y[i], x[i]) + y: numpy.ndarray, int or float + + Returns + ------- + rates: named tuple, float + RateMatrix named tuple + """ + + x = np.concatenate(([0.], x, [1.]),) + y = np.concatenate(([0.], y, [1.]),) + + i = np.argsort(x) + auc = trapz(y[i], x[i]) - return auc + return auc From d8fbcf5f746c3beef59dc3ebd37adad38f5a5b21 Mon Sep 17 00:00:00 2001 From: Renee Hlozek Date: Tue, 21 May 2019 16:35:22 -0400 Subject: [PATCH 5/9] metrics code to plot for plasticc science submission --- proclam/metrics/util.py | 6 ++++++ 1 file 

diff --git a/proclam/metrics/util.py b/proclam/metrics/util.py
index d291dd8..1f356c3 100644
--- a/proclam/metrics/util.py
+++ b/proclam/metrics/util.py
@@ -141,10 +141,15 @@ def det_to_cm(dets, truth, per_class_norm=True, vb=False):
     -----
     I need to fix the norm keyword all around to enable more options, like normed output vs. not.
     """
+
+    print(truth)
     pred_classes, pred_counts = np.unique(dets, return_counts=True)
     true_classes, true_counts = np.unique(truth, return_counts=True)
+
+
     if vb: print('by request '+str((pred_classes, pred_counts), (true_classes, true_counts)))
 
+    print(pred_classes, true_classes, 'huh')
     M = np.int(max(max(pred_classes), max(true_classes)) + 1)
 
     if vb: print('by request '+str((np.shape(dets), np.shape(truth)), M))
@@ -193,6 +198,7 @@ def prob_to_cm(probs, truth, per_class_norm=True, vb=False):
     """
 
     dets = prob_to_det(probs)
+    print(truth, 'huh 1')
     cm = det_to_cm(dets, truth, per_class_norm=per_class_norm, vb=vb)
 
     return cm

From e6e46ca8377e24d1314977a571a9afc4db1df9e6 Mon Sep 17 00:00:00 2001
From: Renee Hlozek
Date: Fri, 7 Jun 2019 09:44:30 -0400
Subject: [PATCH 6/9] adjusted weight in brier and logloss

---
 proclam/metrics/brier.py   |  8 ++++++--
 proclam/metrics/logloss.py | 12 +++++++++---
 proclam/metrics/util.py    | 18 +++++++++++-------
 3 files changed, 26 insertions(+), 12 deletions(-)

diff --git a/proclam/metrics/brier.py b/proclam/metrics/brier.py
index b06f923..f7b8f6b 100644
--- a/proclam/metrics/brier.py
+++ b/proclam/metrics/brier.py
@@ -27,7 +27,7 @@ def __init__(self, scheme=None):
 
         super(Brier, self).__init__(scheme)
 
-    def evaluate(self, prediction, truth, averaging='per_class'):
+    def evaluate(self, prediction, truth, weightvector, averaging='per_class'):
         """
         Evaluates the Brier score
 
@@ -37,6 +37,8 @@ def evaluate(self, prediction, truth, averaging='per_class'):
             predicted class probabilities
         truth: numpy.ndarray, int
             true classes
+        weightvector: numpy.ndarray, float
+            relative per-class weights
         averaging: string, optional
             'per_class' weights classes equally, other keywords possible
             vector assumed to be class weights
@@ -60,7 +62,9 @@ def evaluate(self, prediction, truth, averaging='per_class'):
         q_each = (prediction - truth_mask) ** 2
 
         class_brier = averager(q_each, truth, M)
-        metric = weight_sum(class_brier, weight_vector=weights)
+        weight_vector = weights*weightvector
+
+        metric = weight_sum(class_brier, weight_vector)
 
         assert(~np.isnan(metric))
 
diff --git a/proclam/metrics/logloss.py b/proclam/metrics/logloss.py
index a3fdc29..bed540b 100644
--- a/proclam/metrics/logloss.py
+++ b/proclam/metrics/logloss.py
@@ -30,7 +30,7 @@ def __init__(self, scheme=None):
         super(LogLoss, self).__init__(scheme)
         self.scheme = scheme
 
-    def evaluate(self, prediction, truth, averaging='per_class'):
+    def evaluate(self, prediction, truth, weightvector, averaging='per_class'):
         """
         Evaluates the log-loss
 
@@ -40,6 +40,8 @@ def evaluate(self, prediction, truth, averaging='per_class'):
             predicted class probabilities
         truth: numpy.ndarray, int
             true classes
+        weightvector: numpy.ndarray, float
+            per class weights
         averaging: string or numpy.ndarray, float
             'per_class' weights classes equally, other keywords possible
             vector assumed to be class weights
@@ -53,11 +55,14 @@ def evaluate(self, prediction, truth, averaging='per_class'):
         -----
         This uses the natural log. 
""" + print(weightvector, 'checking') + prediction, truth = np.asarray(prediction), np.asarray(truth) prediction_shape = np.shape(prediction) (N, M) = prediction_shape weights = check_weights(averaging, M, truth=truth) + print('average weights', weights) truth_mask = truth_reformatter(truth, prediction) prediction = sanitize_predictions(prediction) @@ -67,8 +72,9 @@ def evaluate(self, prediction, truth, averaging='per_class'): # use a better structure for checking keyword support class_logloss = averager(logloss_each, truth, M) - - logloss = weight_sum(class_logloss, weight_vector=weights) + weight_vector = weights*weightvector + print('ok ready to go', weight_vector) + logloss = weight_sum(class_logloss, weight_vector=weight_vector) #=weights) assert(~np.isnan(logloss)) diff --git a/proclam/metrics/util.py b/proclam/metrics/util.py index 1f356c3..37a0fa7 100644 --- a/proclam/metrics/util.py +++ b/proclam/metrics/util.py @@ -142,29 +142,32 @@ def det_to_cm(dets, truth, per_class_norm=True, vb=False): I need to fix the norm keyword all around to enable more options, like normed output vs. not. """ - print(truth) pred_classes, pred_counts = np.unique(dets, return_counts=True) true_classes, true_counts = np.unique(truth, return_counts=True) - if vb: print('by request '+str((pred_classes, pred_counts), (true_classes, true_counts))) - print(pred_classes, true_classes, 'huh') +# print(pred_classes, true_classes, 'huh') M = np.int(max(max(pred_classes), max(true_classes)) + 1) if vb: print('by request '+str((np.shape(dets), np.shape(truth)), M)) cm = np.zeros((M, M), dtype=float) coords = np.array(list(zip(dets, truth))) +# print(coords, 'huzzah') + #print(np.shape(coords), 'shape coords') indices, index_counts = np.unique(coords, axis=0, return_counts=True) index_counts = index_counts.astype(int) + #print(index_counts, 'index_counts') + #print(indices, 'indices') if vb: print('by request '+str(index_counts)) # if vb: print(indices, index_counts) indices = indices.T # if vb: print(np.shape(indices)) + #print(cm, 'yo') cm[indices[0], indices[1]] = index_counts - # if vb: print(cm) - + #if vb: print(cm) + #print(cm, 'hi') if per_class_norm: # print(type(cm)) # print(type(true_counts)) @@ -198,7 +201,7 @@ def prob_to_cm(probs, truth, per_class_norm=True, vb=False): """ dets = prob_to_det(probs) - print(truth, 'huh 1') +# print(truth, 'huh 1') cm = det_to_cm(dets, truth, per_class_norm=per_class_norm, vb=vb) return cm @@ -268,7 +271,7 @@ def weight_sum(per_class_metrics, weight_vector, norm=True): ---------- per_class_metrics: numpy.float the scores separated by class (a list of arrays) - weight_vector: numpy.ndarray floar + weight_vector: numpy.ndarray float The array of weights per class norm: boolean, optional @@ -279,6 +282,7 @@ def weight_sum(per_class_metrics, weight_vector, norm=True): """ weight_sum = np.dot(weight_vector, per_class_metrics) + #print(weight_sum, 'weight_sum') return weight_sum def check_weights(avg_info, M, chosen=None, truth=None): From 0d5c63bd30f4946af88911f8037d765f5abb2cd8 Mon Sep 17 00:00:00 2001 From: reneehlozek Date: Wed, 11 Sep 2019 09:32:51 -0400 Subject: [PATCH 7/9] added metrics plots for results --- metrics_plots.py | 207 ++++++++++++++++++++++++++++++++++++++++ proclam/metrics/util.py | 2 + 2 files changed, 209 insertions(+) create mode 100644 metrics_plots.py diff --git a/metrics_plots.py b/metrics_plots.py new file mode 100644 index 0000000..be1d7be --- /dev/null +++ b/metrics_plots.py @@ -0,0 +1,207 @@ +# import string +# import itertools +# import random +# 
import os
+# import csv
+
+import numpy as np
+import pandas as pd
+
+import proclam
+from proclam import *
+import matplotlib as mpl
+mpl.use('Agg')
+import pylab as pl
+mpl.rcParams['text.usetex'] = False
+mpl.rcParams['mathtext.rm'] = 'serif'
+mpl.rcParams['font.family'] = 'serif'
+mpl.rcParams['font.serif'] = 'Times New Roman'
+mpl.rcParams['axes.titlesize'] = 16
+mpl.rcParams['axes.labelsize'] = 14
+mpl.rcParams['savefig.dpi'] = 250
+mpl.rcParams['savefig.format'] = 'pdf'
+mpl.rcParams['savefig.bbox'] = 'tight'
+import matplotlib.pyplot as plt
+metricslist = ['Brier', 'LogLoss']
+colors = ['teal', 'magenta']
+dirname = 'examples/'
+markerlist = ['d', 'o', 's', '*']
+plasticc = {}
+plasticc['label'] = 'plasticc'
+#plasticc['names'] = ['Submission_alpha_0.5_190516_1756', 'submission_40_avocado', 'submission_probe99_40_avocado']
+plasticc['names'] = ['3_MajorTom'] #'2_MikeSilogram' ] #'1_Kyle']
+
+#, '2_MikeSilogram', '3_MajorTom']
+
+
+
+list = [6,15,16,42,52,53,62,64,65,67,88,90,92,95,99]
+itemlist=['uLens-Point', 'TDE', 'EBE', 'SNCC-II', 'SNIa-x', 'MIRA', 'SNCC-Ibc', 'KN', 'Mdwarf', 'SNIa-91bg', 'AGN', 'SNIa-normal', 'RRlyrae', 'SLSN-I', 'Other']
+
+choices=['All']
+
+#, 'uLens-Point', 'TDE', 'EBE', 'SNCC-II', 'SNIa-x', 'MIRA', 'SNCC-Ibc', 'KN', 'Mdwarf', 'SNIa-91bg', 'AGN', 'SNIa-normal', 'RRlyrae', 'SLSN-I', 'Other']
+
+# 90 SNIa-normal
+#62 SNCC-Ibc
+#42 SNCC-II
+#67 SNIa-91bg
+#52 SNIa-x
+#64 KN
+#95 SLSN-I
+#99 Other
+#15 TDE
+#88 AGN
+#92 RRlyrae
+#65 Mdwarf
+#16 EBE
+#53 MIRA
+#6 uLens-Point
+
+
+#Idealized', 'Guess', 'Tunnel', 'Broadbrush', 'Cruise', 'SubsumedTo', 'SubsumedFrom']
+def make_class_pairs(data_info_dict):
+    return zip(data_info_dict['classifications'], data_info_dict['truth_tables'])
+
+def make_file_locs(data_info_dict):
+    names = data_info_dict['names']
+    data_info_dict['dirname'] = dirname + data_info_dict['label'] + '/'
+#    data_info_dict['classifications'] = ['%s/predicted_prob_%s.csv'%(name, name) for name in names]
+#    data_info_dict['truth_tables'] = ['%s/truth_table_%s.csv'%(name, name) for name in names]
+    data_info_dict['classifications'] = ['%s/%s.csv'%(name, name) for name in names]
+    data_info_dict['truth_tables'] = ['%s/%s_truth.csv'%(name, name) for name in names]
+    print(data_info_dict)
+    return data_info_dict
+
+def plot_cm(probs, truth, name, loc=''):
+    print(np.shape(probs), np.shape(truth), 'checking sizes of probs and truth')
+    cm = proclam.metrics.util.prob_to_cm(probs, truth)
+    pl.clf()
+    plt.matshow(cm.T, vmin=0., vmax=1.) 
+# plt.xticks(range(max(truth)+1), names) +# plt.yticks(range(max(truth)+1), names) + plt.xlabel('predicted class') + plt.ylabel('true class') + plt.colorbar() + plt.title(name) + plt.savefig(loc+name+'_cm.png') + #plt.show() + #plt.close() + + + +def read_class_pairs(pair, dataset, cc):#loc='', title=''): + loc=dataset['dirname'] + title=dataset['label']+' '+ dataset['names'][cc] + clfile = pair[0] + truthfile = pair[1] + print(clfile, truthfile) + prob_mat = pd.read_csv(loc+clfile)#, delim_whitespace=True) + nobj = np.shape(prob_mat)[0] + nclass = np.shape(prob_mat)[1]-1 #since they have object ID as an element + cols=prob_mat.columns.tolist() + + objid = prob_mat[cols[0]] + pmat=np.array(prob_mat[cols[1:]]) + + truth_values = pd.read_csv(loc+truthfile) #, delim_whitespace=True) + nobj_truth = np.shape(truth_values)[0] + nclass_truth = np.shape(truth_values)[1]-1 + truvals = np.array(truth_values[cols[1:]]) + tvec = np.where(truvals==1)[1] + + +# pmat = prob_mat[:,1:] + plot_cm(pmat, tvec, title, loc=loc+dataset['names'][cc]+'/') + return pmat, tvec + + +def make_patch_spines_invisible(ax): + ax.set_frame_on(True) + ax.patch.set_visible(False) + for sp in ax.spines.values(): + sp.set_visible(False) + +def per_metric_helper(ax, n, data, metric_names, codes, shapes, colors): + plot_n = n+1 + in_x = np.arange(len(codes)) + ax_n = ax + n_factor = 0.1 * (plot_n - 2) + if plot_n>1: + ax_n = ax.twinx() + rot_ang = 270 + label_space = 15. + else: + rot_ang = 90 + label_space = 0. + if plot_n>2: + ax_n.spines["right"].set_position(("axes", 1. + 0.1 * (plot_n-1))) + make_patch_spines_invisible(ax_n) + ax_n.spines["right"].set_visible(True) + handle = ax_n.scatter(in_x+n_factor*np.ones_like(data[n]), data[n], marker=shapes[n], s=10, color=colors[n], label=metric_names[n]) + ax_n.set_ylabel(metric_names[n], rotation=rot_ang, fontsize=14, labelpad=label_space) +# ax_n.set_ylim(0.9 * min(data[n]), 1.1 * max(data[n])) + return(ax, ax_n, handle) + +def metric_plot(dataset, metric_names, shapes, colors, choice): + codes = dataset['names'] + data = dataset['results'] + title = dataset['label']+' results focusing on: '+str(choice) + fileloc = dataset['dirname']+dataset['label']+'_'+str(choice)+'_results.png' + xs = np.arange(len(codes)) + pl.clf() + fig, ax = plt.subplots() + fig.subplots_adjust(right=1.) 
+    handles = []
+    for n in range(len(metric_names)):
+        (ax, ax_n, handle) = per_metric_helper(ax, n, data, metric_names, codes, shapes, colors)
+        handles.append(handle)
+    plt.xticks(xs, codes)
+    for tick in ax.get_xticklabels():
+        tick.set_rotation(90)
+    plt.xlabel('Classifiers', fontsize=14)
+    leg=plt.legend(handles, metric_names, numpoints=1, loc='lower right')
+    #leg.draw_frame(False)
+    plt.suptitle(title)
+    plt.savefig(fileloc)
+    #plt.show()
+    return
+
+
+for dataset in [ plasticc]: #mystery, snphotcc,
+    dataset = make_file_locs(dataset)
+    dataset['class_pairs'] = make_class_pairs(dataset)
+
+
+for choice in choices:
+    pl.clf()
+
+    if choice=='All':
+        print('ignoring weights for %s'%choice)
+        #weights= None
+        weights=np.ones(len(list))
+        print(len(list), 'length of list')
+    else:
+        weights= np.zeros(len(list)) #1e-5*np.ones(len(list))
+        ind = itemlist.index(choice)
+        print(itemlist)
+        print(ind, 'check ind', choice)
+        print(itemlist[ind], choice, 'checking choice')
+        weights[ind]=1.0
+    for dataset in [plasticc]:
+        data = np.empty((len(metricslist), len(dataset['names'])))
+
+        for cc, pair in enumerate(dataset['class_pairs']):
+            probm, truthv = read_class_pairs(pair, dataset, cc)
+
+            for count, metric in enumerate(metricslist):
+                print(weights, 'checking huh')
+                print(len(weights), 'how many weights?')
+                D = getattr(proclam.metrics, metric)()
+                hm = D.evaluate(probm, truthv, weights)
+                data[count][cc] = hm
+        dataset['results'] = data
+
+        metric_plot(dataset, metricslist, markerlist, colors, choice)
+
+#-----------------------------------------------------
diff --git a/proclam/metrics/util.py b/proclam/metrics/util.py
index 37a0fa7..cc55d26 100644
--- a/proclam/metrics/util.py
+++ b/proclam/metrics/util.py
@@ -168,9 +168,11 @@ def det_to_cm(dets, truth, per_class_norm=True, vb=False):
     cm[indices[0], indices[1]] = index_counts
     #if vb: print(cm)
     #print(cm, 'hi')
+
     if per_class_norm:
 #        print(type(cm))
 #        print(type(true_counts))
+        print(np.shape(cm), np.shape(true_counts), 'shapes')
 #        cm = cm / true_counts
 #        cm /= true_counts[:, np.newaxis]
 #        cm = cm / true_counts[np.newaxis, :]

From 82d48d3936909c5f89e84dc039b60c83bec06134 Mon Sep 17 00:00:00 2001
From: Renee Hlozek
Date: Wed, 4 Dec 2019 08:51:36 -0500
Subject: [PATCH 8/9] code to adjust csv files

---
 organize_csv.py | 48 ++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 48 insertions(+)
 create mode 100644 organize_csv.py

diff --git a/organize_csv.py b/organize_csv.py
new file mode 100644
index 0000000..c58f6d0
--- /dev/null
+++ b/organize_csv.py
@@ -0,0 +1,48 @@
+import pylab as pl
+import pandas as pd
+import numpy as np
+
+list = ['2_MikeSilogram', '3_MajorTom']
+
+kyle = pd.read_csv('examples/plasticc/1_Kyle/1_Kyle.csv')
+kylecols = kyle.columns.tolist()
+
+
+matstruc = kyle.copy()
+matstrucdat = np.array(matstruc)
+indexlist = np.zeros(len(kylecols))
+print(kylecols)
+print(matstrucdat[0,:])
+
+
+truth = pd.read_csv('1_Kyle/1_Kyle_truth.csv')
+
+for file in list:
+    name = file+'/'+file+'.csv'
+    print(name)
+    mat = pd.read_csv(name)
+    matdat = np.array(mat)
+
+    cols = mat.columns.tolist()
+    index = mat.index
+    #print(np.shape(mat))
+    print(cols)
+    print(matdat[0,:], 'test before')
+
+
+    for i in range(len(kylecols)):
+#        df[col] = df[col].replace(findL, replaceL)
+        for j in range(len(cols)):
+            if cols[j]==kylecols[i]:
+                indexlist[i]=j
+                matstrucdat[:,i] = matdat[:,j]
+                print(i, j, cols[j], kylecols[i])
+
+    matstruc.rename(columns={"A": "a", "B": "c"})
+    print(matstrucdat[0,:], 'test after')
+
+    newname = 
file+'/'+file+'_reordered.csv' + newnametruth = file+'/'+file+'_reordered_truth.csv' + + matstruc.to_csv(newname, index=False) + truth.to_csv(newnametruth) From 6f2597f43b723ebba2e2ac247f45346aff0c6c87 Mon Sep 17 00:00:00 2001 From: Renee Hlozek Date: Wed, 4 Dec 2019 09:51:52 -0500 Subject: [PATCH 9/9] updated paths --- organize_csv.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/organize_csv.py b/organize_csv.py index c58f6d0..a98a4f8 100644 --- a/organize_csv.py +++ b/organize_csv.py @@ -1,7 +1,7 @@ import pylab as pl import pandas as pd import numpy as np - +dir = 'examples/plasticc/' list = ['2_MikeSilogram', '3_MajorTom'] kyle = pd.read_csv('examples/plasticc/1_Kyle/1_Kyle.csv') @@ -15,10 +15,10 @@ print(matstrucdat[0,:]) -truth = pd.read_csv('1_Kyle/1_Kyle_truth.csv') +truth = pd.read_csv('examples/plasticc/1_Kyle/1_Kyle_truth.csv') for file in list: - name = file+'/'+file+'.csv' + name = dir+file+'/'+file+'.csv' print(name) mat = pd.read_csv(name) matdat = np.array(mat) @@ -41,8 +41,8 @@ matstruc.rename(columns={"A": "a", "B": "c"}) print(matstrucdat[0,:], 'test after') - newname = file+'/'+file+'_reordered.csv' - newnametruth = file+'/'+file+'_reordered_truth.csv' + newname = dir+file+'/'+file+'_reordered.csv' + newnametruth = dir+file+'/'+file+'_reordered_truth.csv' matstruc.to_csv(newname, index=False) truth.to_csv(newnametruth)
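
A minimal usage sketch of the weighted-metric API this series converges on: after PATCH 6, Brier.evaluate and LogLoss.evaluate take a per-class weight vector as a third positional argument, which is how metrics_plots.py calls them and how the KNeighbors LogLoss shown in kaggle-run.ipynb was produced. Everything below is illustrative rather than part of the patches: the file name and toy inputs are invented, and an importable proclam package is assumed (cf. the ModuleNotFoundError captured in main_paperfigs.ipynb).

# weighted_metrics_sketch.py -- hypothetical example, not part of the patch series.
# Assumes the post-PATCH-6 signature evaluate(prediction, truth, weightvector).
import numpy as np

import proclam

n_obj, n_class = 150, 15                    # 15 classes, matching `list` in metrics_plots.py

np.random.seed(42)                          # toy data only, not a real PLAsTiCC submission
probs = np.random.random((n_obj, n_class))
probs /= probs.sum(axis=1, keepdims=True)   # each row a valid probability vector
truth = np.arange(n_obj) % n_class          # integer true classes; every class appears

weights = np.ones(n_class)                  # the 'All' choice: equal per-class weights
weights[4] = 2.                             # e.g. up-weight one class of interest

for metric in ['Brier', 'LogLoss']:
    # Two-argument getattr: passing the weights as a getattr default (as the
    # original metrics_plots.py loop did) would silently return the weight
    # array itself if the metric name were ever mistyped.
    D = getattr(proclam.metrics, metric)()
    print(metric, D.evaluate(probs, truth, weights))

# The same toy inputs feed the confusion-matrix helper used by plot_cm:
cm = proclam.metrics.util.prob_to_cm(probs, truth)

Inside evaluate, the supplied vector is multiplied into the 'per_class' averaging weights before weight_sum, so an all-ones weightvector reproduces the pre-PATCH-6 behaviour exactly.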