Skip to content

Honesty #338

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Open
wants to merge 13 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 4 additions & 0 deletions .gitmodules
Original file line number Diff line number Diff line change
Expand Up @@ -2,3 +2,7 @@
path = treeple/_lib/sklearn_fork
url = https://github.com/neurodata/scikit-learn
branch = submodulev3
[submodule "treeple/_lib_experimental/sklearn_fork"]
path = treeple/_lib_experimental/sklearn_fork
url = https://github.com/neurodata/scikit-learn.git
branch = scarliles/honesty
211 changes: 211 additions & 0 deletions examples/viss/calibration.ipynb

Large diffs are not rendered by default.

125 changes: 125 additions & 0 deletions examples/viss/test-decision-tree-zero-weights.ipynb
Original file line number Diff line number Diff line change
@@ -0,0 +1,125 @@
{
"cells": [
{
"cell_type": "code",
"execution_count": null,
"id": "224cc887-64de-49da-a04b-7d478e1fa0f7",
"metadata": {},
"outputs": [],
"source": [
"# The Platonic Ideal: Verify _empirically_ that\n",
"# - train, test, oob mutually disjoint\n",
"# - train U test U oob = entire sample\n",
"# - all oob observations get a leaf assignment\n",
"# - all observations within leaf cell bounds\n",
"# - any way to verify optimal splits subject to constraints?\n",
"#\n",
"# The Capitulation to Reality:\n",
"# quite a bit of shenanigans to work around the fact that the base\n",
"# DecisionTreeClassifier does not retain training indices in the nodes,\n",
"# and therefore node membership by index cannot be verified post hoc\n",
"#\n",
"# instead we settle for the following procedure\n",
"# - eliminate randomness\n",
"# - train on untampered data to identify purported honest, structure, and oob\n",
"#   sample indices\n",
"# - shuffle y values among honest samples. if the altered y values are considered\n",
"#   (thereby violating honesty), the splits should change\n",
"# - train again from scratch on data with altered honest set\n",
"# - verify that splits remain the same\n",
"# - we only test unstratified sampling here so that we can shuffle the honest y values\n",
"# - we test stratified sampling at the forest level"
]
},
{
"cell_type": "code",
"execution_count": 5,
"id": "384dbe6e-3300-46c5-bfae-4f86c865b3df",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Done.\n"
]
}
],
"source": [
"import numpy as np\n",
"\n",
"from treeple.datasets import make_trunk_classification\n",
"\n",
"from sklearn.tree import DecisionTreeClassifier\n",
"\n",
"\n",
"# Experiment configuration. The tree's random_state is fixed so that any\n",
"# change in splits across trials must come from the data, not the tree RNG.\n",
"N_ITER = 100\n",
"SAMPLE_SIZE = 256\n",
"RANDOM_STATE = 1\n",
"\n",
"# 1-D, single-informative-feature trunk data with a fixed seed.\n",
"X, y = make_trunk_classification(\n",
" n_samples=SAMPLE_SIZE,\n",
" n_dim=1,\n",
" n_informative=1,\n",
" seed=0,\n",
")\n",
"# NOTE(review): this concatenate reassembles X unchanged (first half +\n",
"# second half); it appears to exist only to mirror the y_t construction\n",
"# below — confirm it can be simplified to X_t = X.copy().\n",
"X_t = np.concatenate((\n",
" X[: SAMPLE_SIZE // 2],\n",
" X[SAMPLE_SIZE // 2 :]\n",
"))\n",
"# Deterministic labels: first half class 0, second half class 1.\n",
"y_t = np.concatenate((np.zeros(SAMPLE_SIZE // 2), np.ones(SAMPLE_SIZE // 2)))\n",
"# Even indices form the structure set; odd indices form the honest set.\n",
"all_indices = [i for i in range(SAMPLE_SIZE)]\n",
"structure_indices = [i for i in range(SAMPLE_SIZE) if i % 2 == 0]\n",
"honest_indices = np.setdiff1d(all_indices, structure_indices)\n",
"# Honest samples get zero sample_weight, so a correct fit must ignore their\n",
"# labels entirely when choosing splits.\n",
"w = np.ones(SAMPLE_SIZE)\n",
"w[honest_indices] = 0\n",
"\n",
"# Baseline fit on unaltered labels; its thresholds are the reference splits.\n",
"tree = DecisionTreeClassifier(random_state=RANDOM_STATE)\n",
"y_perm = y_t.ravel().copy()\n",
"tree.fit(X_t, y_perm, sample_weight=w)\n",
"old_threshold = tree.tree_.threshold.copy()\n",
"old_y = y_perm.copy()\n",
"\n",
"# Each trial shuffles only the honest (zero-weight) y values. If those labels\n",
"# influenced split selection, the learned thresholds would change.\n",
"# NOTE(review): np.random.shuffle is unseeded here, so trials differ across\n",
"# notebook runs — confirm that run-to-run variation is intended.\n",
"for it in range(N_ITER):\n",
" tree = DecisionTreeClassifier(random_state=RANDOM_STATE)\n",
" y_perm = y_t.ravel().copy()\n",
" honest_shuffled = honest_indices.copy()\n",
" np.random.shuffle(honest_shuffled)\n",
"\n",
" # Reassign each honest position the label of another honest position.\n",
" for i in range(len(honest_indices)):\n",
" y_perm[honest_indices[i]] = y_t[honest_shuffled[i]]\n",
"\n",
" # print(f\"y_perm = {y_perm}\")\n",
" # Sanity: the honest labels really did change this trial.\n",
" assert(not np.array_equal(y_t, y_perm))\n",
" assert(not np.array_equal(old_y, y_perm))\n",
"\n",
" # Refit from scratch; thresholds must match the previous fit exactly.\n",
" tree.fit(X_t, y_perm, sample_weight=w)\n",
" assert(np.array_equal(old_threshold, tree.tree_.threshold))\n",
" old_threshold = tree.tree_.threshold.copy()\n",
"\n",
"print(\"Done.\")"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.12.3"
}
},
"nbformat": 4,
"nbformat_minor": 5
}
229 changes: 229 additions & 0 deletions examples/viss/test-honest-forest-alter-X.ipynb
Original file line number Diff line number Diff line change
@@ -0,0 +1,229 @@
{
"cells": [
{
"cell_type": "code",
"execution_count": null,
"id": "75420844-e46a-4bb9-9879-fba35e9af2eb",
"metadata": {},
"outputs": [],
"source": [
"# The Platonic Ideal: Verify empirically that\n",
"# - train, test, oob mutually disjoint\n",
"# - train U test U oob = entire sample\n",
"# - all oob observations get a leaf assignment\n",
"# - all observations within leaf cell bounds\n",
"# - any way to verify optimal splits subject to constraints?\n",
"#\n",
"# The Capitulation to Reality:\n",
"# quite a bit of shenanigans to work around the fact that the base\n",
"# DecisionTreeClassifier does not retain training indices in the nodes,\n",
"# and therefore node membership by index cannot be verified post hoc\n",
"#\n",
"# instead we settle for the following procedure\n",
"# - eliminate randomness\n",
"# - train on untampered data to identify purported honest, structure, and oob\n",
"#   sample indices\n",
"# - alter honest X values such that if they affect splits in any way,\n",
"#   the changes should result in different splits\n",
"# - verify that the splits remain the same\n",
"# - this tests a stronger assumption than the honesty assumption\n",
"#   (that honest Y values are not considered) because stratified sampling necessarily\n",
"#   considers the Y distribution when selecting splits (for honest/structure partitioning),\n",
"#   so we can't get stable partitions across trials if we alter Y values\n",
"# - next we alter structure X values similarly\n",
"# - verify that the splits change as expected\n"
]
},
{
"cell_type": "code",
"execution_count": 8,
"id": "384dbe6e-3300-46c5-bfae-4f86c865b3df",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"done\n"
]
}
],
"source": [
"import numpy as np\n",
"\n",
"from treeple.datasets import make_trunk_classification\n",
"\n",
"# NOTE(review): imported from a scikit-learn fork's ensemble module; the\n",
"# upstream treeple import is kept commented out below for reference.\n",
"from sklearn.ensemble import HonestRandomForestClassifier as HonestForestClassifier\n",
"#from treeple.ensemble import HonestForestClassifier\n",
"\n",
"\n",
"# in order for this test to work, one must ensure that the honest split rejection\n",
"# criteria never veto a desired split by the shadow structure tree.\n",
"# the lazy way to do this is to make sure there are enough honest observations\n",
"# so that there will be enough on either side of any potential structure split.\n",
"# thus more dims => more samples\n",
"N_TREES = 1\n",
"N_DIM = 10\n",
"SAMPLE_SIZE = 2098\n",
"RANDOM_STATE = 1\n",
"HONEST_FRACTION = 0.95\n",
"STRATIFY = True\n",
"\n",
"# Well-separated class means (mu_0=-5 vs mu_1=5) so structure splits are\n",
"# unambiguous and small honest-set perturbations cannot flip them.\n",
"X, y = make_trunk_classification(\n",
" n_samples=SAMPLE_SIZE,\n",
" n_dim=N_DIM,\n",
" n_informative=1,\n",
" seed=0,\n",
" mu_0=-5,\n",
" mu_1=5\n",
")\n",
"# NOTE(review): these concatenates reassemble X and y unchanged (first half\n",
"# + second half); presumably they only force a fresh writable array for the\n",
"# in-place perturb() below — confirm.\n",
"X_t = np.concatenate((\n",
" X[: SAMPLE_SIZE // 2],\n",
" X[SAMPLE_SIZE // 2 :]\n",
"))\n",
"y_t = np.concatenate((\n",
" y[: SAMPLE_SIZE // 2],\n",
" y[SAMPLE_SIZE // 2 :]\n",
"))\n",
"\n",
"\n",
"def perturb(X, y, indices):\n",
" \"\"\"Randomly shift X (in place) at the given row indices; returns (X, y).\n",
"\n",
" Per dimension and row: class-0 rows are shifted by -5 with probability\n",
" 1/2; otherwise the row is shifted by -2 with probability 1/2. y is\n",
" returned unchanged. NOTE(review): uses unseeded np.random.randint, so\n",
" perturbations differ across runs — confirm that is intended.\n",
" \"\"\"\n",
" for d in range(N_DIM):\n",
" for i in indices:\n",
" if y[i] == 0 and np.random.randint(0, 2, 1) > 0:\n",
" X[i, d] -= 5\n",
" elif np.random.randint(0, 2, 1) > 0:\n",
" X[i, d] -= 2\n",
"\n",
" return X, y\n",
"\n",
"\n",
"class Trial:\n",
" \"\"\"Fit one honest forest and record its sample partition and thresholds.\"\"\"\n",
" def __init__(self, X, y):\n",
" self.est = HonestForestClassifier(\n",
" n_estimators=N_TREES,\n",
" max_samples=1.0,\n",
" max_features=0.3,\n",
" bootstrap=True,\n",
" stratify=STRATIFY,\n",
" n_jobs=-2,\n",
" random_state=RANDOM_STATE,\n",
" honest_prior=\"ignore\",\n",
" honest_fraction=HONEST_FRACTION,\n",
" )\n",
" self.est.fit(X, y)\n",
" \n",
" # Single-tree forest (N_TREES == 1): inspect estimator 0 only.\n",
" self.tree = self.est.estimators_[0]\n",
" self.honest_tree = self.tree.tree_\n",
" self.structure_tree = self.honest_tree.target_tree\n",
" # Sorted copies so later np.array_equal comparisons are order-free.\n",
" self.honest_indices = np.sort(self.tree.honest_indices_)\n",
" self.structure_indices = np.sort(self.tree.structure_indices_)\n",
" self.threshold = self.honest_tree.target_tree.threshold.copy()\n",
"\n",
"\n",
"trial_results = []\n",
"# Trial 0: baseline on unperturbed data.\n",
"trial_results.append(Trial(X_t, y_t))\n",
"\n",
"# perturb honest X values; threshold should not change\n",
"X_t, y_t = perturb(X_t, y_t, trial_results[0].honest_indices)\n",
"\n",
"trial_results.append(Trial(X_t, y_t))\n",
"# The honest/structure partition must be identical across trials for the\n",
"# threshold comparison below to be meaningful.\n",
"assert np.array_equal(\n",
" trial_results[0].honest_indices,\n",
" trial_results[1].honest_indices\n",
")\n",
"assert np.array_equal(\n",
" trial_results[0].structure_indices,\n",
" trial_results[1].structure_indices\n",
")\n",
"assert np.array_equal(\n",
" trial_results[0].threshold,\n",
" trial_results[1].threshold\n",
"), f\"threshold1 = {trial_results[0].threshold}\\nthreshold2 = {trial_results[1].threshold}\"\n",
"\n",
"\n",
"# perturb structure X's; threshold should change\n",
"X_t, y_t = perturb(X_t, y_t, trial_results[0].structure_indices)\n",
"trial_results.append(Trial(X_t, y_t))\n",
"assert np.array_equal(\n",
" trial_results[0].honest_indices,\n",
" trial_results[2].honest_indices\n",
")\n",
"assert np.array_equal(\n",
" trial_results[0].structure_indices,\n",
" trial_results[2].structure_indices\n",
")\n",
"assert not np.array_equal(\n",
" trial_results[0].threshold,\n",
" trial_results[2].threshold\n",
")\n",
"\n",
"print(\"done\")\n"
]
},
{
"cell_type": "code",
"execution_count": 7,
"id": "e17943e5-2dec-491c-a712-543dd5ddb9fa",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"done\n"
]
}
],
"source": [
"from sklearn.model_selection import StratifiedShuffleSplit\n",
"\n",
"# verify elimination of randomness from StratifiedShuffleSplit:\n",
"# the same random_state must yield the same split on identical inputs.\n",
"ss = StratifiedShuffleSplit(n_splits=1, test_size=0.5, random_state=1)\n",
"for structure_idx, _ in ss.split(\n",
" np.zeros((20, 1)), [1 if i > 10 else 0 for i in range(20)]\n",
"):\n",
" structure_idx1 = structure_idx.copy()\n",
"\n",
"# Repeat with a freshly constructed splitter using the same seed.\n",
"ss = StratifiedShuffleSplit(n_splits=1, test_size=0.5, random_state=1)\n",
"for structure_idx, _ in ss.split(\n",
" np.zeros((20, 1)), [1 if i > 10 else 0 for i in range(20)]\n",
"):\n",
" structure_idx2 = structure_idx.copy()\n",
"\n",
"assert np.array_equal(structure_idx1, structure_idx2)\n",
"\n",
"print(\"done\")\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "df9b3440-ab64-41d9-904a-8cf26927a9fa",
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.12.3"
}
},
"nbformat": 4,
"nbformat_minor": 5
}
Loading
Loading