Skip to content

Honesty #338

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Open
wants to merge 13 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 4 additions & 0 deletions .gitmodules
Original file line number Diff line number Diff line change
Expand Up @@ -2,3 +2,7 @@
path = treeple/_lib/sklearn_fork
url = https://github.com/neurodata/scikit-learn
branch = submodulev3
[submodule "treeple/_lib_experimental/sklearn_fork"]
path = treeple/_lib_experimental/sklearn_fork
url = https://github.com/neurodata/scikit-learn.git
branch = scarliles/honesty
211 changes: 211 additions & 0 deletions examples/viss/calibration.ipynb

Large diffs are not rendered by default.

125 changes: 125 additions & 0 deletions examples/viss/test-decision-tree-zero-weights.ipynb
Original file line number Diff line number Diff line change
@@ -0,0 +1,125 @@
{
"cells": [
{
"cell_type": "code",
"execution_count": null,
"id": "224cc887-64de-49da-a04b-7d478e1fa0f7",
"metadata": {},
"outputs": [],
"source": [
"# The Platonic Ideal: Verify _empirically_ that\n",
"# - train, test, oob mutually disjoint\n",
"# - train U test U oob = entire sample\n",
"# - all oob observations get a leaf assignment\n",
"# - all observations within leaf cell bounds\n",
"# - any way to verify optimal splits subject to constraints?\n",
"#\n",
"# The Capitulation to Reality:\n",
"# quite a bit of shenanigans to work around the fact that the base\n",
"# DecisionTreeClassifier does not retain training indices in the nodes,\n",
"# and therefore node membership by index cannot be verified post hoc\n",
"#\n",
"# instead we settle for the following procedure\n",
"# - eliminate randomness\n",
"# - train on untampered data to identify purported honest, structure, and oob\n",
"#   sample indices\n",
"# - shuffle y values among honest samples. if the altered y values are considered\n",
"#   (thereby violating honesty), the splits should change\n",
"# - train again from scratch on data with altered honest set\n",
"# - verify that splits remain the same\n",
"# - we only test unstratified sampling here so that we can shuffle the honest y values\n",
"# - we test stratified sampling at the forest level"
]
},
{
"cell_type": "code",
"execution_count": 5,
"id": "384dbe6e-3300-46c5-bfae-4f86c865b3df",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Done.\n"
]
}
],
"source": [
"import numpy as np\n",
"\n",
"from treeple.datasets import make_trunk_classification\n",
"\n",
"from sklearn.tree import DecisionTreeClassifier\n",
"\n",
"\n",
"# Experiment configuration. The tree's random_state is fixed so that any\n",
"# change in splits across trials must come from the data, not the tree RNG.\n",
"N_ITER = 100\n",
"SAMPLE_SIZE = 256\n",
"RANDOM_STATE = 1\n",
"\n",
"# 1-D, single-informative-feature trunk data with a fixed seed.\n",
"X, y = make_trunk_classification(\n",
" n_samples=SAMPLE_SIZE,\n",
" n_dim=1,\n",
" n_informative=1,\n",
" seed=0,\n",
")\n",
"# NOTE(review): this concatenate reassembles X unchanged (first half +\n",
"# second half); it appears to exist only to mirror the y_t construction\n",
"# below — confirm it can be simplified to X_t = X.copy().\n",
"X_t = np.concatenate((\n",
" X[: SAMPLE_SIZE // 2],\n",
" X[SAMPLE_SIZE // 2 :]\n",
"))\n",
"# Deterministic labels: first half class 0, second half class 1.\n",
"y_t = np.concatenate((np.zeros(SAMPLE_SIZE // 2), np.ones(SAMPLE_SIZE // 2)))\n",
"# Even indices form the structure set; odd indices form the honest set.\n",
"all_indices = [i for i in range(SAMPLE_SIZE)]\n",
"structure_indices = [i for i in range(SAMPLE_SIZE) if i % 2 == 0]\n",
"honest_indices = np.setdiff1d(all_indices, structure_indices)\n",
"# Honest samples get zero sample_weight, so a correct fit must ignore their\n",
"# labels entirely when choosing splits.\n",
"w = np.ones(SAMPLE_SIZE)\n",
"w[honest_indices] = 0\n",
"\n",
"# Baseline fit on unaltered labels; its thresholds are the reference splits.\n",
"tree = DecisionTreeClassifier(random_state=RANDOM_STATE)\n",
"y_perm = y_t.ravel().copy()\n",
"tree.fit(X_t, y_perm, sample_weight=w)\n",
"old_threshold = tree.tree_.threshold.copy()\n",
"old_y = y_perm.copy()\n",
"\n",
"# Each trial shuffles only the honest (zero-weight) y values. If those labels\n",
"# influenced split selection, the learned thresholds would change.\n",
"# NOTE(review): np.random.shuffle is unseeded here, so trials differ across\n",
"# notebook runs — confirm that run-to-run variation is intended.\n",
"for it in range(N_ITER):\n",
" tree = DecisionTreeClassifier(random_state=RANDOM_STATE)\n",
" y_perm = y_t.ravel().copy()\n",
" honest_shuffled = honest_indices.copy()\n",
" np.random.shuffle(honest_shuffled)\n",
"\n",
" # Reassign each honest position the label of another honest position.\n",
" for i in range(len(honest_indices)):\n",
" y_perm[honest_indices[i]] = y_t[honest_shuffled[i]]\n",
"\n",
" # print(f\"y_perm = {y_perm}\")\n",
" # Sanity: the honest labels really did change this trial.\n",
" assert(not np.array_equal(y_t, y_perm))\n",
" assert(not np.array_equal(old_y, y_perm))\n",
"\n",
" # Refit from scratch; thresholds must match the previous fit exactly.\n",
" tree.fit(X_t, y_perm, sample_weight=w)\n",
" assert(np.array_equal(old_threshold, tree.tree_.threshold))\n",
" old_threshold = tree.tree_.threshold.copy()\n",
"\n",
"print(\"Done.\")"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.12.3"
}
},
"nbformat": 4,
"nbformat_minor": 5
}
229 changes: 229 additions & 0 deletions examples/viss/test-honest-forest-alter-X.ipynb
Original file line number Diff line number Diff line change
@@ -0,0 +1,229 @@
{
"cells": [
{
"cell_type": "code",
"execution_count": null,
"id": "75420844-e46a-4bb9-9879-fba35e9af2eb",
"metadata": {},
"outputs": [],
"source": [
"# The Platonic Ideal: Verify empirically that\n",
"# - train, test, oob mutually disjoint\n",
"# - train U test U oob = entire sample\n",
"# - all oob observations get a leaf assignment\n",
"# - all observations within leaf cell bounds\n",
"# - any way to verify optimal splits subject to constraints?\n",
"#\n",
"# The Capitulation to Reality:\n",
"# quite a bit of shenanigans to work around the fact that the base\n",
"# DecisionTreeClassifier does not retain training indices in the nodes,\n",
"# and therefore node membership by index cannot be verified post hoc\n",
"#\n",
"# instead we settle for the following procedure\n",
"# - eliminate randomness\n",
"# - train on untampered data to identify purported honest, structure, and oob\n",
"#   sample indices\n",
"# - alter honest X values such that if they affect splits in any way,\n",
"#   the changes should result in different splits\n",
"# - verify that the splits remain the same\n",
"# - this tests a stronger assumption than the honesty assumption\n",
"#   (that honest Y values are not considered) because stratified sampling necessarily\n",
"#   considers the Y distribution when selecting splits (for honest/structure partitioning),\n",
"#   so we can't get stable partitions across trials if we alter Y values\n",
"# - next we alter structure X values similarly\n",
"# - verify that the splits change as expected\n"
]
},
{
"cell_type": "code",
"execution_count": 8,
"id": "384dbe6e-3300-46c5-bfae-4f86c865b3df",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"done\n"
]
}
],
"source": [
"import numpy as np\n",
"\n",
"from treeple.datasets import make_trunk_classification\n",
"\n",
"# NOTE(review): imported from a scikit-learn fork's ensemble module; the\n",
"# upstream treeple import is kept commented out below for reference.\n",
"from sklearn.ensemble import HonestRandomForestClassifier as HonestForestClassifier\n",
"#from treeple.ensemble import HonestForestClassifier\n",
"\n",
"\n",
"# in order for this test to work, one must ensure that the honest split rejection\n",
"# criteria never veto a desired split by the shadow structure tree.\n",
"# the lazy way to do this is to make sure there are enough honest observations\n",
"# so that there will be enough on either side of any potential structure split.\n",
"# thus more dims => more samples\n",
"N_TREES = 1\n",
"N_DIM = 10\n",
"SAMPLE_SIZE = 2098\n",
"RANDOM_STATE = 1\n",
"HONEST_FRACTION = 0.95\n",
"STRATIFY = True\n",
"\n",
"# Well-separated class means (mu_0=-5 vs mu_1=5) so structure splits are\n",
"# unambiguous and small honest-set perturbations cannot flip them.\n",
"X, y = make_trunk_classification(\n",
" n_samples=SAMPLE_SIZE,\n",
" n_dim=N_DIM,\n",
" n_informative=1,\n",
" seed=0,\n",
" mu_0=-5,\n",
" mu_1=5\n",
")\n",
"# NOTE(review): these concatenates reassemble X and y unchanged (first half\n",
"# + second half); presumably they only force a fresh writable array for the\n",
"# in-place perturb() below — confirm.\n",
"X_t = np.concatenate((\n",
" X[: SAMPLE_SIZE // 2],\n",
" X[SAMPLE_SIZE // 2 :]\n",
"))\n",
"y_t = np.concatenate((\n",
" y[: SAMPLE_SIZE // 2],\n",
" y[SAMPLE_SIZE // 2 :]\n",
"))\n",
"\n",
"\n",
"def perturb(X, y, indices):\n",
" \"\"\"Randomly shift X (in place) at the given row indices; returns (X, y).\n",
"\n",
" Per dimension and row: class-0 rows are shifted by -5 with probability\n",
" 1/2; otherwise the row is shifted by -2 with probability 1/2. y is\n",
" returned unchanged. NOTE(review): uses unseeded np.random.randint, so\n",
" perturbations differ across runs — confirm that is intended.\n",
" \"\"\"\n",
" for d in range(N_DIM):\n",
" for i in indices:\n",
" if y[i] == 0 and np.random.randint(0, 2, 1) > 0:\n",
" X[i, d] -= 5\n",
" elif np.random.randint(0, 2, 1) > 0:\n",
" X[i, d] -= 2\n",
"\n",
" return X, y\n",
"\n",
"\n",
"class Trial:\n",
" \"\"\"Fit one honest forest and record its sample partition and thresholds.\"\"\"\n",
" def __init__(self, X, y):\n",
" self.est = HonestForestClassifier(\n",
" n_estimators=N_TREES,\n",
" max_samples=1.0,\n",
" max_features=0.3,\n",
" bootstrap=True,\n",
" stratify=STRATIFY,\n",
" n_jobs=-2,\n",
" random_state=RANDOM_STATE,\n",
" honest_prior=\"ignore\",\n",
" honest_fraction=HONEST_FRACTION,\n",
" )\n",
" self.est.fit(X, y)\n",
" \n",
" # Single-tree forest (N_TREES == 1): inspect estimator 0 only.\n",
" self.tree = self.est.estimators_[0]\n",
" self.honest_tree = self.tree.tree_\n",
" self.structure_tree = self.honest_tree.target_tree\n",
" # Sorted copies so later np.array_equal comparisons are order-free.\n",
" self.honest_indices = np.sort(self.tree.honest_indices_)\n",
" self.structure_indices = np.sort(self.tree.structure_indices_)\n",
" self.threshold = self.honest_tree.target_tree.threshold.copy()\n",
"\n",
"\n",
"trial_results = []\n",
"# Trial 0: baseline on unperturbed data.\n",
"trial_results.append(Trial(X_t, y_t))\n",
"\n",
"# perturb honest X values; threshold should not change\n",
"X_t, y_t = perturb(X_t, y_t, trial_results[0].honest_indices)\n",
"\n",
"trial_results.append(Trial(X_t, y_t))\n",
"# The honest/structure partition must be identical across trials for the\n",
"# threshold comparison below to be meaningful.\n",
"assert np.array_equal(\n",
" trial_results[0].honest_indices,\n",
" trial_results[1].honest_indices\n",
")\n",
"assert np.array_equal(\n",
" trial_results[0].structure_indices,\n",
" trial_results[1].structure_indices\n",
")\n",
"assert np.array_equal(\n",
" trial_results[0].threshold,\n",
" trial_results[1].threshold\n",
"), f\"threshold1 = {trial_results[0].threshold}\\nthreshold2 = {trial_results[1].threshold}\"\n",
"\n",
"\n",
"# perturb structure X's; threshold should change\n",
"X_t, y_t = perturb(X_t, y_t, trial_results[0].structure_indices)\n",
"trial_results.append(Trial(X_t, y_t))\n",
"assert np.array_equal(\n",
" trial_results[0].honest_indices,\n",
" trial_results[2].honest_indices\n",
")\n",
"assert np.array_equal(\n",
" trial_results[0].structure_indices,\n",
" trial_results[2].structure_indices\n",
")\n",
"assert not np.array_equal(\n",
" trial_results[0].threshold,\n",
" trial_results[2].threshold\n",
")\n",
"\n",
"print(\"done\")\n"
]
},
{
"cell_type": "code",
"execution_count": 7,
"id": "e17943e5-2dec-491c-a712-543dd5ddb9fa",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"done\n"
]
}
],
"source": [
"from sklearn.model_selection import StratifiedShuffleSplit\n",
"\n",
"# verify elimination of randomness from StratifiedShuffleSplit:\n",
"# the same random_state must yield the same split on identical inputs.\n",
"ss = StratifiedShuffleSplit(n_splits=1, test_size=0.5, random_state=1)\n",
"for structure_idx, _ in ss.split(\n",
" np.zeros((20, 1)), [1 if i > 10 else 0 for i in range(20)]\n",
"):\n",
" structure_idx1 = structure_idx.copy()\n",
"\n",
"# Repeat with a freshly constructed splitter using the same seed.\n",
"ss = StratifiedShuffleSplit(n_splits=1, test_size=0.5, random_state=1)\n",
"for structure_idx, _ in ss.split(\n",
" np.zeros((20, 1)), [1 if i > 10 else 0 for i in range(20)]\n",
"):\n",
" structure_idx2 = structure_idx.copy()\n",
"\n",
"assert np.array_equal(structure_idx1, structure_idx2)\n",
"\n",
"print(\"done\")\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "df9b3440-ab64-41d9-904a-8cf26927a9fa",
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.12.3"
}
},
"nbformat": 4,
"nbformat_minor": 5
}
Loading
Loading