From 83f124c515894267045e0308d8f40d6b059e8eff Mon Sep 17 00:00:00 2001 From: Renee Hlozek Date: Mon, 22 Apr 2019 14:33:24 -0400 Subject: [PATCH 1/9] updated science prize confusion matrix re-ordering, still issues with the labelling --- metrics_evaluation.ipynb | 185 +++++++++++++++++-------------------- paper/kaggle-run.ipynb | 26 ++++-- paper/main_paperfigs.ipynb | 37 ++++++-- 3 files changed, 131 insertions(+), 117 deletions(-) diff --git a/metrics_evaluation.ipynb b/metrics_evaluation.ipynb index 7b8163f..98ccc76 100644 --- a/metrics_evaluation.ipynb +++ b/metrics_evaluation.ipynb @@ -16,8 +16,10 @@ }, { "cell_type": "code", - "execution_count": null, - "metadata": {}, + "execution_count": 1, + "metadata": { + "scrolled": true + }, "outputs": [], "source": [ "# import string\n", @@ -35,8 +37,10 @@ }, { "cell_type": "code", - "execution_count": null, - "metadata": {}, + "execution_count": 2, + "metadata": { + "scrolled": true + }, "outputs": [], "source": [ "import matplotlib as mpl\n", @@ -63,8 +67,10 @@ }, { "cell_type": "code", - "execution_count": null, - "metadata": {}, + "execution_count": 3, + "metadata": { + "scrolled": true + }, "outputs": [], "source": [ "metricslist = ['Brier', 'LogLoss']\n", @@ -89,8 +95,10 @@ }, { "cell_type": "code", - "execution_count": null, - "metadata": {}, + "execution_count": 4, + "metadata": { + "scrolled": true + }, "outputs": [], "source": [ "mystery = {}\n", @@ -107,8 +115,10 @@ }, { "cell_type": "code", - "execution_count": null, - "metadata": {}, + "execution_count": 5, + "metadata": { + "scrolled": true + }, "outputs": [], "source": [ "snphotcc = {}\n", @@ -130,8 +140,10 @@ }, { "cell_type": "code", - "execution_count": null, - "metadata": {}, + "execution_count": 6, + "metadata": { + "scrolled": true + }, "outputs": [], "source": [ "plasticc = {}\n", @@ -140,38 +152,16 @@ ] }, { - "cell_type": "code", - "execution_count": null, + "cell_type": "markdown", "metadata": {}, - "outputs": [], - "source": [ - "# old_snphotcc_names = []\n", - "# for prefix in ['templates_', 'wavelets_']:\n", - "# for suffix in ['boost_forest', 'knn', 'nb', 'neural_network', 'svm']:\n", - "# old_snphotcc_names.append(prefix+suffix+'.dat')\n", - "\n", - "# for i in range(len(snphotcc_names)):\n", - "# name = old_snphotcc_names[i]\n", - "# fileloc = dirname+'classifications/'+name\n", - "# snphotcc_info = pd.read_csv(fileloc, sep=' ')\n", - "# full = snphotcc_info.set_index('Object').join(truth_snphotcc.set_index('Object'))\n", - "# name = snphotcc_names[i]\n", - " \n", - "# truth = full['Type'] - 1\n", - "# snphotcc_truth_table = proclam.metrics.util.det_to_prob(truth)\n", - "# fileloc = 'examples/'+name+'/truth_table_'+name+'.csv'\n", - "# with open(fileloc, 'wb') as truth_place:\n", - "# np.savetxt(fileloc, snphotcc_truth_table, delimiter=' ')\n", - " \n", - "# probs = full[['1', '2', '3']]\n", - "# fileloc = 'examples/'+name+'/predicted_prob_'+name+'.csv'\n", - "# probs.to_csv(fileloc, sep=' ', index=False, header=True)" - ] + "source": [] }, { "cell_type": "code", - "execution_count": null, - "metadata": {}, + "execution_count": 7, + "metadata": { + "scrolled": true + }, "outputs": [], "source": [ "# more_names = snphotcc_names\n", @@ -182,8 +172,10 @@ }, { "cell_type": "code", - "execution_count": null, - "metadata": {}, + "execution_count": 8, + "metadata": { + "scrolled": true + }, "outputs": [], "source": [ "def make_class_pairs(data_info_dict):\n", @@ -200,11 +192,21 @@ }, { "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], + 
"execution_count": 9, + "metadata": { + "scrolled": true + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "{'label': 'ProClaM', 'names': ['Idealized', 'Guess', 'Tunnel', 'Broadbrush', 'Cruise', 'SubsumedTo', 'SubsumedFrom'], 'dirname': 'examples/ProClaM/', 'classifications': ['Idealized/predicted_prob_Idealized.csv', 'Guess/predicted_prob_Guess.csv', 'Tunnel/predicted_prob_Tunnel.csv', 'Broadbrush/predicted_prob_Broadbrush.csv', 'Cruise/predicted_prob_Cruise.csv', 'SubsumedTo/predicted_prob_SubsumedTo.csv', 'SubsumedFrom/predicted_prob_SubsumedFrom.csv'], 'truth_tables': ['Idealized/truth_table_Idealized.csv', 'Guess/truth_table_Guess.csv', 'Tunnel/truth_table_Tunnel.csv', 'Broadbrush/truth_table_Broadbrush.csv', 'Cruise/truth_table_Cruise.csv', 'SubsumedTo/truth_table_SubsumedTo.csv', 'SubsumedFrom/truth_table_SubsumedFrom.csv']}\n" + ] + } + ], "source": [ - "for dataset in [mystery, snphotcc, plasticc]:\n", + "for dataset in [ plasticc]: #mystery, snphotcc,\n", " dataset = make_file_locs(dataset)\n", " dataset['class_pairs'] = make_class_pairs(dataset)" ] @@ -220,8 +222,10 @@ }, { "cell_type": "code", - "execution_count": null, - "metadata": {}, + "execution_count": 10, + "metadata": { + "scrolled": true + }, "outputs": [], "source": [ "def plot_cm(probs, truth, name, loc=''):\n", @@ -233,14 +237,17 @@ " plt.ylabel('true class')\n", " plt.colorbar()\n", " plt.title(name)\n", - " plt.savefig(loc+name+'_cm.png')\n", - " plt.close()" + " #plt.savefig(loc+name+'_cm.png')\n", + " plt.show()\n", + " #plt.close()" ] }, { "cell_type": "code", - "execution_count": null, - "metadata": {}, + "execution_count": 11, + "metadata": { + "scrolled": true + }, "outputs": [], "source": [ "def read_class_pairs(pair, dataset, cc):#loc='', title=''):\n", @@ -255,6 +262,7 @@ " nobj_truth = np.shape(truth_values)[0]\n", " nclass_truth = np.shape(truth_values)[1]\n", " tvec = np.where(truth_values==1)[1]\n", + " print(tvec)\n", "# if nclass_truth!= nclass:\n", "# print('Truth table of size %i x %i and prob matrix of size %i x %i do not match up in size'%(nobj,nclass,nobj_truth,nclass_truth))\n", "# else:\n", @@ -266,8 +274,10 @@ }, { "cell_type": "code", - "execution_count": null, - "metadata": {}, + "execution_count": 12, + "metadata": { + "scrolled": true + }, "outputs": [], "source": [ "def make_patch_spines_invisible(ax):\n", @@ -316,6 +326,7 @@ " plt.legend(handles, metric_names)\n", " plt.suptitle(title)\n", " plt.savefig(fileloc)\n", + " plt.show()\n", " return" ] }, @@ -328,13 +339,28 @@ }, { "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], + "execution_count": 13, + "metadata": { + "scrolled": false + }, + "outputs": [ + { + "ename": "KeyError", + "evalue": "'class_pairs'", + "output_type": "error", + "traceback": [ + "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", + "\u001b[0;31mKeyError\u001b[0m Traceback (most recent call last)", + "\u001b[0;32m\u001b[0m in \u001b[0;36m\u001b[0;34m\u001b[0m\n\u001b[1;32m 1\u001b[0m \u001b[0;32mfor\u001b[0m \u001b[0mdataset\u001b[0m \u001b[0;32min\u001b[0m \u001b[0;34m[\u001b[0m\u001b[0mmystery\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0msnphotcc\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mplasticc\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 2\u001b[0m \u001b[0mdata\u001b[0m \u001b[0;34m=\u001b[0m 
\u001b[0mnp\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mempty\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mlen\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mmetricslist\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mlen\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mdataset\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;34m'names'\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m----> 3\u001b[0;31m \u001b[0;32mfor\u001b[0m \u001b[0mcc\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mpair\u001b[0m \u001b[0;32min\u001b[0m \u001b[0menumerate\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mdataset\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;34m'class_pairs'\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 4\u001b[0m \u001b[0mprint\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mpair\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 5\u001b[0m \u001b[0mprobm\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mtruthv\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mread_class_pairs\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mpair\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mdataset\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mcc\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;31m#loc=dataset['dirname'], title=dataset['label']+' '+dataset['names'][cc])\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", + "\u001b[0;31mKeyError\u001b[0m: 'class_pairs'" + ] + } + ], "source": [ "for dataset in [mystery, snphotcc, plasticc]:\n", " data = np.empty((len(metricslist), len(dataset['names'])))\n", " for cc, pair in enumerate(dataset['class_pairs']):\n", + " print(pair)\n", " probm, truthv = read_class_pairs(pair, dataset, cc)#loc=dataset['dirname'], title=dataset['label']+' '+dataset['names'][cc])\n", " for count, metric in enumerate(metricslist):\n", " D = getattr(proclam.metrics, metric)()\n", @@ -344,45 +370,6 @@ " metric_plot(dataset, metricslist, markerlist, colors)" ] }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# more_data = np.empty((len(metricslist), len(more_names)))\n", - "# for cc, pair in enumerate(more_class_pairs):\n", - "# probm, truthv = read_class_pairs(pair, dirname)\n", - "# for count, metric in enumerate(metricslist):\n", - "# D = getattr(proclam.metrics, metric)()\n", - "# hm = D.evaluate(probm, truthv)\n", - "# more_data[count][cc] = hm" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# metric_plot(more_names, metricslist, more_data, markerlist, colors, title='SNPhotCC', fileloc=dirname+'snphotccdata.png')" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# data = np.empty((len(metricslist), len(names)))\n", - "# for cc, pair in enumerate(class_pairs):\n", - "# probm, truthv = read_class_pairs(pair, dirname)\n", - "# for count, metric in enumerate(metricslist):\n", - "# D = getattr(proclam.metrics, metric)()\n", - "# hm = D.evaluate(probm, truthv)\n", - "# data[count][cc] = hm" - ] - }, { "cell_type": "code", "execution_count": null, @@ -390,9 +377,7 @@ "scrolled": true }, "outputs": [], - "source": [ - "# metric_plot(names, metricslist, data, markerlist, colors, title='Mystery Dataset', fileloc=dirname+'mysterydata.png')" - ] + "source": [] }, { "cell_type": "code", @@ -419,7 +404,7 @@ "name": 
"python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.6.5" + "version": "3.6.8" } }, "nbformat": 4, diff --git a/paper/kaggle-run.ipynb b/paper/kaggle-run.ipynb index 5f32d29..f8ce066 100644 --- a/paper/kaggle-run.ipynb +++ b/paper/kaggle-run.ipynb @@ -175,12 +175,12 @@ { "data": { "text/plain": [ - "{'KNeighbors': ['KNeighbors/predicted_prob_KNeighbors.csv',\n", + "{'RandomForest': ['RandomForest/predicted_prob_RandomForest.csv',\n", + " 'RandomForest/truth_table_RandomForest.csv'],\n", + " 'KNeighbors': ['KNeighbors/predicted_prob_KNeighbors.csv',\n", " 'KNeighbors/truth_table_KNeighbors.csv'],\n", " 'MLPNeuralNet': ['MLPNeuralNet/predicted_prob_MLPNeuralNet.csv',\n", - " 'MLPNeuralNet/truth_table_MLPNeuralNet.csv'],\n", - " 'RandomForest': ['RandomForest/predicted_prob_RandomForest.csv',\n", - " 'RandomForest/truth_table_RandomForest.csv']}" + " 'MLPNeuralNet/truth_table_MLPNeuralNet.csv']}" ] }, "execution_count": 5, @@ -212,7 +212,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 6, "metadata": {}, "outputs": [], "source": [ @@ -622,9 +622,19 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 7, "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "KNeighbors with weights [0.05882353 0.11764706 0.05882353 0.05882353 0.05882353 0.05882353\n", + " 0.05882353 0.05882353 0.05882353 0.05882353 0.11764706 0.11764706\n", + " 0.11764706] has LogLoss = 20.749255306361132\n" + ] + } + ], "source": [ "# This is how you run the metric with a random weight vector.\n", "\n", @@ -703,7 +713,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.6.4" + "version": "3.6.8" } }, "nbformat": 4, diff --git a/paper/main_paperfigs.ipynb b/paper/main_paperfigs.ipynb index 476ae20..75f2287 100644 --- a/paper/main_paperfigs.ipynb +++ b/paper/main_paperfigs.ipynb @@ -2,7 +2,7 @@ "cells": [ { "cell_type": "code", - "execution_count": null, + "execution_count": 1, "metadata": { "tags": [ "hideme" @@ -18,7 +18,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 2, "metadata": { "tags": [ "hideme" @@ -51,7 +51,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 3, "metadata": {}, "outputs": [], "source": [ @@ -121,14 +121,33 @@ { "cell_type": "code", "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": 5, "metadata": { "tags": [ "hideme" ] }, - "outputs": [], - "source": [ - "import proclam\n", + "outputs": [ + { + "ename": "ModuleNotFoundError", + "evalue": "No module named 'proclam'", + "output_type": "error", + "traceback": [ + "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", + "\u001b[0;31mModuleNotFoundError\u001b[0m Traceback (most recent call last)", + "\u001b[0;32m\u001b[0m in \u001b[0;36m\u001b[0;34m\u001b[0m\n\u001b[1;32m 1\u001b[0m \u001b[0;31m#import proclam\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m----> 2\u001b[0;31m \u001b[0;32mfrom\u001b[0m \u001b[0mproclam\u001b[0m \u001b[0;32mimport\u001b[0m \u001b[0;34m*\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m", + "\u001b[0;31mModuleNotFoundError\u001b[0m: No module named 'proclam'" + ] + } + ], + "source": [ + "#import proclam\n", "from proclam import *" ] }, @@ -1913,9 +1932,9 @@ "anaconda-cloud": {}, 
"celltoolbar": "Tags", "kernelspec": { - "display_name": "ProClaM (Python 3)", + "display_name": "Python 3", "language": "python", - "name": "proclam_3" + "name": "python3" }, "language_info": { "codemirror_mode": { @@ -1927,7 +1946,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.6.5" + "version": "3.6.8" } }, "nbformat": 4, From 308f1975fe95f68d3fcf7761d83559e40d42698b Mon Sep 17 00:00:00 2001 From: Renee Hlozek Date: Tue, 23 Apr 2019 16:50:39 -0400 Subject: [PATCH 2/9] indentation util --- proclam/metrics/util.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/proclam/metrics/util.py b/proclam/metrics/util.py index db2e5e9..60cc054 100644 --- a/proclam/metrics/util.py +++ b/proclam/metrics/util.py @@ -398,6 +398,6 @@ def auc(x, y): y = np.concatenate(([0.], y, [1.]),) i = np.argsort(x) - auc = trapz(y[i], x[i]) + auc = trapz(y[i], x[i]) return auc From 7c18574eccf1870e7e70f0d2d5879a6863026435 Mon Sep 17 00:00:00 2001 From: Renee Hlozek Date: Tue, 23 Apr 2019 16:53:44 -0400 Subject: [PATCH 3/9] indendation --- proclam/metrics/util.py | 28 ++++++++++++++-------------- 1 file changed, 14 insertions(+), 14 deletions(-) diff --git a/proclam/metrics/util.py b/proclam/metrics/util.py index 60cc054..9ed65e5 100644 --- a/proclam/metrics/util.py +++ b/proclam/metrics/util.py @@ -381,23 +381,23 @@ def recall(classifications,truth,class_idx): def auc(x, y): """ Computes the area under curve (just a wrapper for trapezoid rule) - - Parameters - ---------- - x: numpy.ndarray, int or float - - y: numpy.ndarray, int or float - - Returns - ------- - rates: named tuple, float + + Parameters + ---------- + x: numpy.ndarray, int or float + + y: numpy.ndarray, int or float + + Returns + ------- + rates: named tuple, float RateMatrix named tuple - """ - + """ + x = np.concatenate(([0.], x, [1.]),) y = np.concatenate(([0.], y, [1.]),) - + i = np.argsort(x) auc = trapz(y[i], x[i]) - + return auc From 86ed0df3e254f634cb3bb5f199caf7d515f6b04f Mon Sep 17 00:00:00 2001 From: Renee Hlozek Date: Tue, 23 Apr 2019 16:55:58 -0400 Subject: [PATCH 4/9] indentation util --- proclam/metrics/util.py | 40 ++++++++++++++++++++-------------------- 1 file changed, 20 insertions(+), 20 deletions(-) diff --git a/proclam/metrics/util.py b/proclam/metrics/util.py index 9ed65e5..d291dd8 100644 --- a/proclam/metrics/util.py +++ b/proclam/metrics/util.py @@ -379,25 +379,25 @@ def recall(classifications,truth,class_idx): return tp/(tp+fn) def auc(x, y): - """ - Computes the area under curve (just a wrapper for trapezoid rule) - - Parameters - ---------- - x: numpy.ndarray, int or float - - y: numpy.ndarray, int or float - - Returns - ------- - rates: named tuple, float - RateMatrix named tuple - """ - - x = np.concatenate(([0.], x, [1.]),) - y = np.concatenate(([0.], y, [1.]),) + """ + Computes the area under curve (just a wrapper for trapezoid rule) + + Parameters + ---------- + x: numpy.ndarray, int or float - i = np.argsort(x) - auc = trapz(y[i], x[i]) + y: numpy.ndarray, int or float + + Returns + ------- + rates: named tuple, float + RateMatrix named tuple + """ + + x = np.concatenate(([0.], x, [1.]),) + y = np.concatenate(([0.], y, [1.]),) + + i = np.argsort(x) + auc = trapz(y[i], x[i]) - return auc + return auc From d8fbcf5f746c3beef59dc3ebd37adad38f5a5b21 Mon Sep 17 00:00:00 2001 From: Renee Hlozek Date: Tue, 21 May 2019 16:35:22 -0400 Subject: [PATCH 5/9] metrics code to plot for plasticc science submission --- proclam/metrics/util.py | 6 ++++++ 1 file 

diff --git a/proclam/metrics/util.py b/proclam/metrics/util.py
index d291dd8..1f356c3 100644
--- a/proclam/metrics/util.py
+++ b/proclam/metrics/util.py
@@ -141,10 +141,15 @@ def det_to_cm(dets, truth, per_class_norm=True, vb=False):
     -----
     I need to fix the norm keyword all around to enable more options, like normed output vs. not.
     """
+
+    print(truth)
     pred_classes, pred_counts = np.unique(dets, return_counts=True)
     true_classes, true_counts = np.unique(truth, return_counts=True)
+
+
     if vb: print('by request '+str((pred_classes, pred_counts), (true_classes, true_counts)))
 
+    print(pred_classes, true_classes, 'huh')
     M = np.int(max(max(pred_classes), max(true_classes)) + 1)
 
     if vb: print('by request '+str((np.shape(dets), np.shape(truth)), M))
@@ -193,6 +198,7 @@ def prob_to_cm(probs, truth, per_class_norm=True, vb=False):
     """
 
     dets = prob_to_det(probs)
+    print(truth, 'huh 1')
     cm = det_to_cm(dets, truth, per_class_norm=per_class_norm, vb=vb)
 
     return cm

From e6e46ca8377e24d1314977a571a9afc4db1df9e6 Mon Sep 17 00:00:00 2001
From: Renee Hlozek
Date: Fri, 7 Jun 2019 09:44:30 -0400
Subject: [PATCH 6/9] adjusted weight in brier and logloss

---
 proclam/metrics/brier.py   |  8 ++++++--
 proclam/metrics/logloss.py | 12 +++++++++---
 proclam/metrics/util.py    | 18 +++++++++++-------
 3 files changed, 26 insertions(+), 12 deletions(-)

diff --git a/proclam/metrics/brier.py b/proclam/metrics/brier.py
index b06f923..f7b8f6b 100644
--- a/proclam/metrics/brier.py
+++ b/proclam/metrics/brier.py
@@ -27,7 +27,7 @@ def __init__(self, scheme=None):
 
         super(Brier, self).__init__(scheme)
 
-    def evaluate(self, prediction, truth, averaging='per_class'):
+    def evaluate(self, prediction, truth, weightvector, averaging='per_class'):
         """
         Evaluates the Brier score
 
@@ -37,6 +37,8 @@ def evaluate(self, prediction, truth, averaging='per_class'):
             predicted class probabilities
         truth: numpy.ndarray, int
             true classes
+        weightvector: numpy.ndarray, float
+            relative per-class weights
         averaging: string, optional
             'per_class' weights classes equally, other keywords possible
             vector assumed to be class weights
@@ -60,7 +62,9 @@ def evaluate(self, prediction, truth, averaging='per_class'):
         q_each = (prediction - truth_mask) ** 2
 
         class_brier = averager(q_each, truth, M)
-        metric = weight_sum(class_brier, weight_vector=weights)
+        weight_vector = weights*weightvector
+
+        metric = weight_sum(class_brier, weight_vector)
 
         assert(~np.isnan(metric))
 
diff --git a/proclam/metrics/logloss.py b/proclam/metrics/logloss.py
index a3fdc29..bed540b 100644
--- a/proclam/metrics/logloss.py
+++ b/proclam/metrics/logloss.py
@@ -30,7 +30,7 @@ def __init__(self, scheme=None):
         super(LogLoss, self).__init__(scheme)
         self.scheme = scheme
 
-    def evaluate(self, prediction, truth, averaging='per_class'):
+    def evaluate(self, prediction, truth, weightvector, averaging='per_class'):
         """
         Evaluates the log-loss
 
@@ -40,6 +40,8 @@ def evaluate(self, prediction, truth, averaging='per_class'):
             predicted class probabilities
         truth: numpy.ndarray, int
             true classes
+        weightvector: numpy.ndarray, float
+            per class weights
         averaging: string or numpy.ndarray, float
             'per_class' weights classes equally, other keywords possible
             vector assumed to be class weights
@@ -53,11 +55,14 @@ def evaluate(self, prediction, truth, averaging='per_class'):
         -----
         This uses the natural log. 
""" + print(weightvector, 'checking') + prediction, truth = np.asarray(prediction), np.asarray(truth) prediction_shape = np.shape(prediction) (N, M) = prediction_shape weights = check_weights(averaging, M, truth=truth) + print('average weights', weights) truth_mask = truth_reformatter(truth, prediction) prediction = sanitize_predictions(prediction) @@ -67,8 +72,9 @@ def evaluate(self, prediction, truth, averaging='per_class'): # use a better structure for checking keyword support class_logloss = averager(logloss_each, truth, M) - - logloss = weight_sum(class_logloss, weight_vector=weights) + weight_vector = weights*weightvector + print('ok ready to go', weight_vector) + logloss = weight_sum(class_logloss, weight_vector=weight_vector) #=weights) assert(~np.isnan(logloss)) diff --git a/proclam/metrics/util.py b/proclam/metrics/util.py index 1f356c3..37a0fa7 100644 --- a/proclam/metrics/util.py +++ b/proclam/metrics/util.py @@ -142,29 +142,32 @@ def det_to_cm(dets, truth, per_class_norm=True, vb=False): I need to fix the norm keyword all around to enable more options, like normed output vs. not. """ - print(truth) pred_classes, pred_counts = np.unique(dets, return_counts=True) true_classes, true_counts = np.unique(truth, return_counts=True) - if vb: print('by request '+str((pred_classes, pred_counts), (true_classes, true_counts))) - print(pred_classes, true_classes, 'huh') +# print(pred_classes, true_classes, 'huh') M = np.int(max(max(pred_classes), max(true_classes)) + 1) if vb: print('by request '+str((np.shape(dets), np.shape(truth)), M)) cm = np.zeros((M, M), dtype=float) coords = np.array(list(zip(dets, truth))) +# print(coords, 'huzzah') + #print(np.shape(coords), 'shape coords') indices, index_counts = np.unique(coords, axis=0, return_counts=True) index_counts = index_counts.astype(int) + #print(index_counts, 'index_counts') + #print(indices, 'indices') if vb: print('by request '+str(index_counts)) # if vb: print(indices, index_counts) indices = indices.T # if vb: print(np.shape(indices)) + #print(cm, 'yo') cm[indices[0], indices[1]] = index_counts - # if vb: print(cm) - + #if vb: print(cm) + #print(cm, 'hi') if per_class_norm: # print(type(cm)) # print(type(true_counts)) @@ -198,7 +201,7 @@ def prob_to_cm(probs, truth, per_class_norm=True, vb=False): """ dets = prob_to_det(probs) - print(truth, 'huh 1') +# print(truth, 'huh 1') cm = det_to_cm(dets, truth, per_class_norm=per_class_norm, vb=vb) return cm @@ -268,7 +271,7 @@ def weight_sum(per_class_metrics, weight_vector, norm=True): ---------- per_class_metrics: numpy.float the scores separated by class (a list of arrays) - weight_vector: numpy.ndarray floar + weight_vector: numpy.ndarray float The array of weights per class norm: boolean, optional @@ -279,6 +282,7 @@ def weight_sum(per_class_metrics, weight_vector, norm=True): """ weight_sum = np.dot(weight_vector, per_class_metrics) + #print(weight_sum, 'weight_sum') return weight_sum def check_weights(avg_info, M, chosen=None, truth=None): From 0d5c63bd30f4946af88911f8037d765f5abb2cd8 Mon Sep 17 00:00:00 2001 From: reneehlozek Date: Wed, 11 Sep 2019 09:32:51 -0400 Subject: [PATCH 7/9] added metrics plots for results --- metrics_plots.py | 207 ++++++++++++++++++++++++++++++++++++++++ proclam/metrics/util.py | 2 + 2 files changed, 209 insertions(+) create mode 100644 metrics_plots.py diff --git a/metrics_plots.py b/metrics_plots.py new file mode 100644 index 0000000..be1d7be --- /dev/null +++ b/metrics_plots.py @@ -0,0 +1,207 @@ +# import string +# import itertools +# import random +# 
import os
+# import csv
+
+import numpy as np
+import pandas as pd
+
+import proclam
+from proclam import *
+import matplotlib as mpl
+mpl.use('Agg')
+import pylab as pl
+mpl.rcParams['text.usetex'] = False
+mpl.rcParams['mathtext.rm'] = 'serif'
+mpl.rcParams['font.family'] = 'serif'
+mpl.rcParams['font.serif'] = 'Times New Roman'
+mpl.rcParams['axes.titlesize'] = 16
+mpl.rcParams['axes.labelsize'] = 14
+mpl.rcParams['savefig.dpi'] = 250
+mpl.rcParams['savefig.format'] = 'pdf'
+mpl.rcParams['savefig.bbox'] = 'tight'
+import matplotlib.pyplot as plt
+metricslist = ['Brier', 'LogLoss']
+colors = ['teal', 'magenta']
+dirname = 'examples/'
+markerlist = ['d', 'o', 's', '*']
+plasticc = {}
+plasticc['label'] = 'plasticc'
+#plasticc['names'] = ['Submission_alpha_0.5_190516_1756', 'submission_40_avocado', 'submission_probe99_40_avocado']
+plasticc['names'] = ['3_MajorTom'] #'2_MikeSilogram' ] #'1_Kyle']
+
+#, '2_MikeSilogram', '3_MajorTom']
+
+
+
+list = [6,15,16,42,52,53,62,64,65,67,88,90,92,95,99]
+itemlist=['uLens-Point', 'TDE', 'EBE', 'SNCC-II', 'SNIa-x', 'MIRA', 'SNCC-Ibc', 'KN', 'Mdwarf', 'SNIa-91bg', 'AGN', 'SNIa-normal', 'RRlyrae', 'SLSN-I', 'Other']
+
+choices=['All']
+
+#, 'uLens-Point', 'TDE', 'EBE', 'SNCC-II', 'SNIa-x', 'MIRA', 'SNCC-Ibc', 'KN', 'Mdwarf', 'SNIa-91bg', 'AGN', 'SNIa-normal', 'RRlyrae', 'SLSN-I', 'Other']
+
+# 90 SNIa-normal
+#62 SNCC-Ibc
+#42 SNCC-II
+#67 SNIa-91bg
+#52 SNIa-x
+#64 KN
+#95 SLSN-I
+#99 Other
+#15 TDE
+#88 AGN
+#92 RRlyrae
+#65 Mdwarf
+#16 EBE
+#53 MIRA
+#6 uLens-Point
+
+
+#Idealized', 'Guess', 'Tunnel', 'Broadbrush', 'Cruise', 'SubsumedTo', 'SubsumedFrom']
+def make_class_pairs(data_info_dict):
+    return zip(data_info_dict['classifications'], data_info_dict['truth_tables'])
+
+def make_file_locs(data_info_dict):
+    names = data_info_dict['names']
+    data_info_dict['dirname'] = dirname + data_info_dict['label'] + '/'
+#    data_info_dict['classifications'] = ['%s/predicted_prob_%s.csv'%(name, name) for name in names]
+#    data_info_dict['truth_tables'] = ['%s/truth_table_%s.csv'%(name, name) for name in names]
+    data_info_dict['classifications'] = ['%s/%s.csv'%(name, name) for name in names]
+    data_info_dict['truth_tables'] = ['%s/%s_truth.csv'%(name, name) for name in names]
+    print(data_info_dict)
+    return data_info_dict
+
+def plot_cm(probs, truth, name, loc=''):
+    print(np.shape(probs), np.shape(truth), 'checking sizes of probs and truth')
+    cm = proclam.metrics.util.prob_to_cm(probs, truth)
+    pl.clf()
+    plt.matshow(cm.T, vmin=0., vmax=1.) 
+# plt.xticks(range(max(truth)+1), names) +# plt.yticks(range(max(truth)+1), names) + plt.xlabel('predicted class') + plt.ylabel('true class') + plt.colorbar() + plt.title(name) + plt.savefig(loc+name+'_cm.png') + #plt.show() + #plt.close() + + + +def read_class_pairs(pair, dataset, cc):#loc='', title=''): + loc=dataset['dirname'] + title=dataset['label']+' '+ dataset['names'][cc] + clfile = pair[0] + truthfile = pair[1] + print(clfile, truthfile) + prob_mat = pd.read_csv(loc+clfile)#, delim_whitespace=True) + nobj = np.shape(prob_mat)[0] + nclass = np.shape(prob_mat)[1]-1 #since they have object ID as an element + cols=prob_mat.columns.tolist() + + objid = prob_mat[cols[0]] + pmat=np.array(prob_mat[cols[1:]]) + + truth_values = pd.read_csv(loc+truthfile) #, delim_whitespace=True) + nobj_truth = np.shape(truth_values)[0] + nclass_truth = np.shape(truth_values)[1]-1 + truvals = np.array(truth_values[cols[1:]]) + tvec = np.where(truvals==1)[1] + + +# pmat = prob_mat[:,1:] + plot_cm(pmat, tvec, title, loc=loc+dataset['names'][cc]+'/') + return pmat, tvec + + +def make_patch_spines_invisible(ax): + ax.set_frame_on(True) + ax.patch.set_visible(False) + for sp in ax.spines.values(): + sp.set_visible(False) + +def per_metric_helper(ax, n, data, metric_names, codes, shapes, colors): + plot_n = n+1 + in_x = np.arange(len(codes)) + ax_n = ax + n_factor = 0.1 * (plot_n - 2) + if plot_n>1: + ax_n = ax.twinx() + rot_ang = 270 + label_space = 15. + else: + rot_ang = 90 + label_space = 0. + if plot_n>2: + ax_n.spines["right"].set_position(("axes", 1. + 0.1 * (plot_n-1))) + make_patch_spines_invisible(ax_n) + ax_n.spines["right"].set_visible(True) + handle = ax_n.scatter(in_x+n_factor*np.ones_like(data[n]), data[n], marker=shapes[n], s=10, color=colors[n], label=metric_names[n]) + ax_n.set_ylabel(metric_names[n], rotation=rot_ang, fontsize=14, labelpad=label_space) +# ax_n.set_ylim(0.9 * min(data[n]), 1.1 * max(data[n])) + return(ax, ax_n, handle) + +def metric_plot(dataset, metric_names, shapes, colors, choice): + codes = dataset['names'] + data = dataset['results'] + title = dataset['label']+' results focusing on: '+str(choice) + fileloc = dataset['dirname']+dataset['label']+'_'+str(choice)+'_results.png' + xs = np.arange(len(codes)) + pl.clf() + fig, ax = plt.subplots() + fig.subplots_adjust(right=1.) 
+    handles = []
+    for n in range(len(metric_names)):
+        (ax, ax_n, handle) = per_metric_helper(ax, n, data, metric_names, codes, shapes, colors)
+        handles.append(handle)
+    plt.xticks(xs, codes)
+    for tick in ax.get_xticklabels():
+        tick.set_rotation(90)
+    plt.xlabel('Classifiers', fontsize=14)
+    leg=plt.legend(handles, metric_names, numpoints=1, loc='lower right')
+    #leg.draw_frame(False)
+    plt.suptitle(title)
+    plt.savefig(fileloc)
+    #plt.show()
+    return
+
+
+for dataset in [ plasticc]: #mystery, snphotcc,
+    dataset = make_file_locs(dataset)
+    dataset['class_pairs'] = make_class_pairs(dataset)
+
+
+for choice in choices:
+    pl.clf()
+
+    if choice=='All':
+        print('ignoring weights for %s'%choice)
+        #weights= None
+        weights=np.ones(len(list))
+        print(len(list), 'length of list')
+    else:
+        weights= np.zeros(len(list)) #1e-5*np.ones(len(list))
+        ind = itemlist.index(choice)
+        print(itemlist)
+        print(ind, 'check ind', choice)
+        print(itemlist[ind], choice, 'checking choice')
+        weights[ind]=1.0
+    for dataset in [plasticc]:
+        data = np.empty((len(metricslist), len(dataset['names'])))
+
+        for cc, pair in enumerate(dataset['class_pairs']):
+            probm, truthv = read_class_pairs(pair, dataset, cc)
+
+            for count, metric in enumerate(metricslist):
+                print(weights, 'checking huh')
+                print(len(weights), 'how many weights?')
+                D = getattr(proclam.metrics, metric)()
+                hm = D.evaluate(probm, truthv, weights)
+                data[count][cc] = hm
+        dataset['results'] = data
+
+        metric_plot(dataset, metricslist, markerlist, colors, choice)
+
+#-----------------------------------------------------
diff --git a/proclam/metrics/util.py b/proclam/metrics/util.py
index 37a0fa7..cc55d26 100644
--- a/proclam/metrics/util.py
+++ b/proclam/metrics/util.py
@@ -168,9 +168,11 @@ def det_to_cm(dets, truth, per_class_norm=True, vb=False):
     cm[indices[0], indices[1]] = index_counts
     #if vb: print(cm)
     #print(cm, 'hi')
+
     if per_class_norm:
 #        print(type(cm))
 #        print(type(true_counts))
+        print(np.shape(cm), np.shape(true_counts), 'shapes')
 #        cm = cm / true_counts
 #        cm /= true_counts[:, np.newaxis]
 #        cm = cm / true_counts[np.newaxis, :]

From 82d48d3936909c5f89e84dc039b60c83bec06134 Mon Sep 17 00:00:00 2001
From: Renee Hlozek
Date: Wed, 4 Dec 2019 08:51:36 -0500
Subject: [PATCH 8/9] code to adjust csv files

---
 organize_csv.py | 48 ++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 48 insertions(+)
 create mode 100644 organize_csv.py

diff --git a/organize_csv.py b/organize_csv.py
new file mode 100644
index 0000000..c58f6d0
--- /dev/null
+++ b/organize_csv.py
@@ -0,0 +1,48 @@
+import pylab as pl
+import pandas as pd
+import numpy as np
+
+list = ['2_MikeSilogram', '3_MajorTom']
+
+kyle = pd.read_csv('examples/plasticc/1_Kyle/1_Kyle.csv')
+kylecols = kyle.columns.tolist()
+
+
+matstruc = kyle.copy()
+matstrucdat = np.array(matstruc)
+indexlist = np.zeros(len(kylecols))
+print(kylecols)
+print(matstrucdat[0,:])
+
+
+truth = pd.read_csv('1_Kyle/1_Kyle_truth.csv')
+
+for file in list:
+    name = file+'/'+file+'.csv'
+    print(name)
+    mat = pd.read_csv(name)
+    matdat = np.array(mat)
+
+    cols = mat.columns.tolist()
+    index = mat.index
+    #print(np.shape(mat))
+    print(cols)
+    print(matdat[0,:], 'test before')
+
+
+    for i in range(len(kylecols)):
+#        df[col] = df[col].replace(findL, replaceL)
+        for j in range(len(cols)):
+            if cols[j]==kylecols[i]:
+                indexlist[i]=j
+                matstrucdat[:,i] = matdat[:,j]
+                print(i, j, cols[j], kylecols[i])
+
+    matstruc.rename(columns={"A": "a", "B": "c"})
+    print(matstrucdat[0,:], 'test after')
+
+    newname = 
file+'/'+file+'_reordered.csv' + newnametruth = file+'/'+file+'_reordered_truth.csv' + + matstruc.to_csv(newname, index=False) + truth.to_csv(newnametruth) From 6f2597f43b723ebba2e2ac247f45346aff0c6c87 Mon Sep 17 00:00:00 2001 From: Renee Hlozek Date: Wed, 4 Dec 2019 09:51:52 -0500 Subject: [PATCH 9/9] updated paths --- organize_csv.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/organize_csv.py b/organize_csv.py index c58f6d0..a98a4f8 100644 --- a/organize_csv.py +++ b/organize_csv.py @@ -1,7 +1,7 @@ import pylab as pl import pandas as pd import numpy as np - +dir = 'examples/plasticc/' list = ['2_MikeSilogram', '3_MajorTom'] kyle = pd.read_csv('examples/plasticc/1_Kyle/1_Kyle.csv') @@ -15,10 +15,10 @@ print(matstrucdat[0,:]) -truth = pd.read_csv('1_Kyle/1_Kyle_truth.csv') +truth = pd.read_csv('examples/plasticc/1_Kyle/1_Kyle_truth.csv') for file in list: - name = file+'/'+file+'.csv' + name = dir+file+'/'+file+'.csv' print(name) mat = pd.read_csv(name) matdat = np.array(mat) @@ -41,8 +41,8 @@ matstruc.rename(columns={"A": "a", "B": "c"}) print(matstrucdat[0,:], 'test after') - newname = file+'/'+file+'_reordered.csv' - newnametruth = file+'/'+file+'_reordered_truth.csv' + newname = dir+file+'/'+file+'_reordered.csv' + newnametruth = dir+file+'/'+file+'_reordered_truth.csv' matstruc.to_csv(newname, index=False) truth.to_csv(newnametruth)
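
A minimal usage sketch of the weighted-metric API this series converges on: after PATCH 6, Brier.evaluate and LogLoss.evaluate take a per-class weight vector as a third positional argument, which is how metrics_plots.py calls them and how the KNeighbors LogLoss shown in kaggle-run.ipynb was produced. Everything below is illustrative rather than part of the patches: the file name and toy inputs are invented, and an importable proclam package is assumed (cf. the ModuleNotFoundError captured in main_paperfigs.ipynb).

# weighted_metrics_sketch.py -- hypothetical example, not part of the patch series.
# Assumes the post-PATCH-6 signature evaluate(prediction, truth, weightvector).
import numpy as np

import proclam

n_obj, n_class = 150, 15                    # 15 classes, matching `list` in metrics_plots.py

np.random.seed(42)                          # toy data only, not a real PLAsTiCC submission
probs = np.random.random((n_obj, n_class))
probs /= probs.sum(axis=1, keepdims=True)   # each row a valid probability vector
truth = np.arange(n_obj) % n_class          # integer true classes; every class appears

weights = np.ones(n_class)                  # the 'All' choice: equal per-class weights
weights[4] = 2.                             # e.g. up-weight one class of interest

for metric in ['Brier', 'LogLoss']:
    # Two-argument getattr: passing the weights as a getattr default (as the
    # original metrics_plots.py loop did) would silently return the weight
    # array itself if the metric name were ever mistyped.
    D = getattr(proclam.metrics, metric)()
    print(metric, D.evaluate(probs, truth, weights))

# The same toy inputs feed the confusion-matrix helper used by plot_cm:
cm = proclam.metrics.util.prob_to_cm(probs, truth)

Inside evaluate, the supplied vector is multiplied into the 'per_class' averaging weights before weight_sum, so an all-ones weightvector reproduces the pre-PATCH-6 behaviour exactly.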