diff --git a/.bumpversion.cfg b/.bumpversion.cfg
new file mode 100644
index 0000000..4bcdc38
--- /dev/null
+++ b/.bumpversion.cfg
@@ -0,0 +1,22 @@
+[bumpversion]
+current_version = 0.1.0
+commit = False
+tag = False
+allow_dirty = False
+parse = (?P<major>\d+)\.(?P<minor>\d+)\.(?P<patch>\d+)(\-(?P<release>[a-z]+)(?P<build>\d+))?
+serialize =
+    {major}.{minor}.{patch}-{release}{build}
+    {major}.{minor}.{patch}
+
+[bumpversion:part:release]
+optional_value = prod
+first_value = dev
+values =
+    dev
+    prod
+
+[bumpversion:part:build]
+
+[bumpversion:file:setup.py]
+
+[bumpversion:file:src/kyle/__init__.py]
diff --git a/.coveragerc b/.coveragerc
new file mode 100644
index 0000000..c5319bc
--- /dev/null
+++ b/.coveragerc
@@ -0,0 +1,2 @@
+[run]
+source = src
diff --git a/.github/workflows/release.yaml b/.github/workflows/release.yaml
new file mode 100644
index 0000000..fe26c0d
--- /dev/null
+++ b/.github/workflows/release.yaml
@@ -0,0 +1,48 @@
+on:
+  push:
+    # Sequence of patterns matched against refs/tags
+    tags:
+      - 'v*' # Push events to matching v*, i.e. v1.0, v20.15.10
+
+name: Create Release
+
+jobs:
+  build:
+    name: Create GitHub Release
+    runs-on: ubuntu-latest
+    steps:
+      - name: Checkout code
+        uses: actions/checkout@v2
+      - name: Create Release
+        id: create_release
+        uses: actions/create-release@v1
+        env:
+          GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} # This token is provided by Actions, you do not need to create your own token
+        with:
+          tag_name: ${{ github.ref }}
+          release_name: Release ${{ github.ref }}
+          body: |
+            Changes in this Release
+            - First Change
+            - Second Change
+          draft: false
+          prerelease: false
+  deploy:
+    runs-on: ubuntu-latest
+    steps:
+      - uses: actions/checkout@v2
+      - name: Set up Python for PyPI Release
+        uses: actions/setup-python@v1
+        with:
+          python-version: '3.8'
+      - name: Install dependencies for PyPI Release
+        run: |
+          python -m pip install --upgrade pip
+          pip install setuptools wheel twine
+      - name: Build and publish to PyPI
+        env:
+          TWINE_USERNAME: ${{ secrets.PYPI_USERNAME }}
+          TWINE_PASSWORD: ${{ secrets.PYPI_PASSWORD }}
+        run: |
+          python setup.py sdist bdist_wheel
+          twine upload dist/*
diff --git a/.github/workflows/tox.yaml b/.github/workflows/tox.yaml
new file mode 100644
index 0000000..77c043e
--- /dev/null
+++ b/.github/workflows/tox.yaml
@@ -0,0 +1,58 @@
+name: Merge develop, run tests and build documentation
+
+on:
+  pull_request:
+    branches: [develop]
+  push:
+    branches: [develop, master]
+  workflow_dispatch:
+    inputs:
+      reason:
+        description: Why did you trigger the pipeline?
+        required: False
+        default: Check if it runs again due to external changes
+
+jobs:
+  build:
+    runs-on: ubuntu-latest
+
+    steps:
+      # pandoc needed for docs, see https://nbsphinx.readthedocs.io/en/0.7.1/installation.html?highlight=pandoc#pandoc
+      - name: Install Non-Python Packages
+        run: sudo apt-get update -yq && sudo apt-get -yq install pandoc
+      - uses: actions/checkout@v2.3.1
+        with:
+          fetch-depth: 0
+          lfs: true
+          persist-credentials: false
+      # lfs=true is not enough, see https://stackoverflow.com/questions/61463578/github-actions-actions-checkoutv2-lfs-true-flag-not-converting-pointers-to-act
+      - name: Checkout LFS Objects
+        run: git lfs pull
+      - name: Merge develop into current branch
+        if: github.ref != 'refs/heads/develop'
+        run: |
+          git fetch origin develop:develop --update-head-ok
+          git merge develop
+      - name: Setup Python 3.8
+        uses: actions/setup-python@v1
+        with:
+          python-version: "3.8"
+      - name: Install Tox and Python Packages
+        run: pip install tox
+      - name: Run Tox
+        run: tox
+      - name: Prepare Pages
+        if: github.ref == 'refs/heads/develop'
+        run: |
+          mv docs/_build/html/* public/docs
+          mv htmlcov/* public/coverage
+      - name: Deploy Pages
+        uses: JamesIves/github-pages-deploy-action@3.7.1
+        if: github.ref == 'refs/heads/develop'
+        with:
+          GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
+          BRANCH: gh-pages
+          FOLDER: public
+          TARGET_FOLDER: .
+          CLEAN: true
+          SINGLE_COMMIT: true
diff --git a/.gitignore b/.gitignore
old mode 100755
new mode 100644
index c1f1e8f..77b704a
--- a/.gitignore
+++ b/.gitignore
@@ -131,3 +131,8 @@ dmypy.json
 
 # Pyre type checker
 .pyre/
+
+# reports
+pylint.html
+
+data
\ No newline at end of file
diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml
index 69e572f..9459afe 100644
--- a/.gitlab-ci.yml
+++ b/.gitlab-ci.yml
@@ -2,21 +2,137 @@ image: "python:3.8-buster"
 
 stages:
   - tox
-  - package
+  - documentation
+  - build
+  - publish
+  - update-tox-cache
 
-tox:
+variables:
+  PIP_CACHE_DIR: "$CI_PROJECT_DIR/.cache/pip"
+
+cache: &global_cache
+  paths:
+    - .cache/pip
+    - .venv/
+    - .tox
+    - apt-cache/
+  key: ${CI_COMMIT_REF_SLUG}
+
+# Pip's cache doesn't store the python packages
+# https://pip.pypa.io/en/stable/reference/pip_install/#caching
+before_script:
+  - mkdir -p apt-cache
+  # pandoc needed for docs, see https://nbsphinx.readthedocs.io/en/0.7.1/installation.html?highlight=pandoc#pandoc
+  - apt-get update -yq && apt-get -o dir::cache::archives="$(pwd)/apt-cache" -yq install pandoc
+  - if [ -e $CONFIG_LOCAL ]; then mv $CONFIG_LOCAL ./config_local.json && echo "retrieved local config"; fi
+  - pip install virtualenv
+  - virtualenv .venv
+  - source .venv/bin/activate
+
+.tox_job: &tox_job
   stage: tox
   script:
     - pip install tox
     - tox
+  artifacts:
+    paths:
+      - badges
+      - docs/_build
+      - htmlcov
+      - pylint.html
+
+tox_recreate:
+  only:
+    changes:
+      - requirements.txt
+  cache:
+    # push cache if dependencies have changed
+    <<: *global_cache
+    policy: push
+  <<: *tox_job
+
+tox_use_cache:
+  except:
+    changes:
+      - requirements.txt
+  cache:
+    # use cache if dependencies haven't changed
+    <<: *global_cache
+    policy: pull
+  <<: *tox_job
+
+pages:
+  cache: {}
+  stage: documentation
+  script:
+    - mv docs/_build/html/* public/docs
+    - mv pylint.html public/pylint/index.html
+    - mv htmlcov/* public/coverage
+  artifacts:
+    paths:
+      - public
+  only:
+    - develop
 
 package:
-  stage: package
+  cache:
+    paths:
+      - .cache/pip
+      - .venv/
+    key: "$CI_JOB_NAME-$CI_COMMIT_REF_SLUG"
+  stage: build
   script:
+    - |
+      # Bump version number of develop branch
+      if [ "$CI_COMMIT_BRANCH" = "develop" ]; then
+        # Git config
+        git config user.name "Gitlab CI"
+        git config user.email "gitlab@example.org"
+        chmod 0600 $GITLAB_DEPLOY_KEY
+
+        # HTTPS clone URL -> git+ssh URL for pushing
+        export GIT_REPO_URL_SSH=$(echo -n $CI_REPOSITORY_URL | sed -r 's%https?://.*@([^/]+)/%git@\1:%' -)
+        git remote set-url origin $GIT_REPO_URL_SSH
+        export GIT_SSH_COMMAND='ssh -i $GITLAB_DEPLOY_KEY -o IdentitiesOnly=yes -o StrictHostKeyChecking=no'
+
+        pip install bump2version
+        apt-get update && apt-get -o dir::cache::archives="$(pwd)/apt-cache" -yq install git-lfs
+
+        bump2version build --commit
+        git push -o ci.skip origin HEAD:develop
+      fi
     - pip install setuptools wheel
     - python setup.py sdist bdist_wheel
   artifacts:
     paths:
+      - dist/*.tar.gz
       - dist/*.whl
+
+publish_package:
+  cache: {}
+  only:
+    - tags
+    - develop
+  stage: publish
+  needs: [package]
+  script:
+    - pip install twine
+    - export TWINE_REPOSITORY_URL=$PYPI_REPO_URL
+    - export TWINE_USERNAME=$PYPI_REPO_USER
+    - export TWINE_PASSWORD=$PYPI_REPO_PASS
+    - twine upload dist/*
+
+update_tox_cache:
+  needs: []
+  except:
+    changes:
+      - requirements.txt
+  when: manual
+  allow_failure: true
+  cache:
+    <<: *global_cache
+    policy: push
+  stage: update-tox-cache
+  script:
+    - pip install tox
+    - tox -r
diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
new file mode 100644
index 0000000..14d8171
--- /dev/null
+++ b/.pre-commit-config.yaml
@@ -0,0 +1,6 @@
+repos:
+  - repo: https://github.com/psf/black
+    rev: 20.8b1
+    hooks:
+      - id: black
+        language_version: python3
\ No newline at end of file
diff --git a/.pylintrc b/.pylintrc
new file mode 100644
index 0000000..11cd815
--- /dev/null
+++ b/.pylintrc
@@ -0,0 +1,9 @@
+[MESSAGE CONTROL]
+disable =
+    I0011  # reasoning
+
+[MASTER]
+load-plugins=pylint_json2html
+
+[REPORTS]
+output-format=jsonextended
diff --git a/LICENSE.txt b/LICENSE.txt
new file mode 100644
index 0000000..ae71793
--- /dev/null
+++ b/LICENSE.txt
@@ -0,0 +1,21 @@
+kyle - a python library for classifier calibration
+
+Copyright 2021-2021 by appliedAI
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.
\ No newline at end of file
diff --git a/README.md b/README.md
index 6ab81d3..209182e 100644
--- a/README.md
+++ b/README.md
@@ -1,51 +1,18 @@
-# kale
-
-This repository contains a library template with utilities for building, testing, documentation
-and configuration management.
-
-## Workflow
-Automated builds, tests, generation of docu and publishing should be handled by cicd pipelines.
-You might already have an initial version of the pipeline here. Below you will find further details on testing
-and documentation.
-
-Before pushing your changes to the remote it is often useful to execute `tox` locally in order to
-detect mistakes early on.
-
-We strongly suggest to use some form of virtual environment for working with the library. E.g. with conda:
-```shell script
-conda create -n kale python=3.8
-conda activate kale
-pip install -r requirements.txt
-```
-
-### Testing and packaging
-The library is tested with tox which will build and install the package and run pytest and doctest.
-You can run it locally by installing tox into your virtual environment
-(e.g. with `pip install tox`) and executing `tox`.
-
-For creating a package locally run
-```shell script
-python setup.py sdist bdist_wheel
-```
-
-### Documentation
-Documentation is built with sphinx every time tox is executed.
-There is a helper script for updating documentation files automatically. It is called by tox on built and can
-also be invoked as
-```bash
-python scripts/update_docs.py
-```
-See the code documentation in the script for more details on that
-
-### Note
-You might wonder why the requirements.txt already contains numpy. The reason is that tox seems to have a problem with empty
-requirements files. Feel free to remove numpy once you have non-trivial requirements
-
-## Configuration Management
-The repository also includes configuration utilities that are often helpful when using data-related libraries.
-They do not form part of the resulting package, you can (and probably should) adjust them to your needs.
-
-## CI/CD
-Depending on the provider you chose for CI/CD, this repo might already contain a rudimentary CI/CD pipeline.
-The pipelines serve for building and testing the library and for publishing the resulting package and documentation.
-You will probably have to further adjust it to your needs.
+# Kyle - a Calibration Toolkit
+
+This library contains utilities for measuring and visualizing the calibration of probabilistic classifiers, as well
+as for recalibrating them. Currently, only recalibration through post-processing is supported, although we plan to
+include calibration-specific training algorithms in the future.
+
+Kyle is model agnostic: any probabilistic classifier can be wrapped with a thin wrapper called `CalibratableModel`,
+which supports multiple calibration algorithms. For a quick overview of the API, have a look at the calibration demo
+notebook (a version of the notebook with executed cells can be found in the documentation).
+
+Apart from tools for analysing models, kyle also offers support for developing and testing custom calibration metrics
+and algorithms. To avoid relying on evaluation data sets and trained models for obtaining labels and confidence
+vectors, custom samplers based on [fake classifiers](our paper/review) can be constructed with kyle. These samplers
+can also be fit on a data set in case you want to mimic it. Using the fake classifiers, an arbitrary number of ground
+truth labels and miscalibrated confidence vectors can be generated to help you analyse your algorithms (common use
+cases are studying the variance and bias of calibration metrics and benchmarking recalibration algorithms). Several
+pre-configured fake classifiers mimicking common models, e.g. vision models trained on MNIST and CIFAR10, are
+implemented in kyle and can be used out of the box.
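+
+## Quick example
+
+The following is a minimal sketch of the intended workflow, based on the calibration demo notebook. Here `model`,
+`X_train`, `y_train`, `X_val` and `y_val` are placeholders for your own fitted sklearn-style classifier and data
+splits; they are not objects provided by kyle:
+
+```python
+from kyle.calibration import ModelCalibrator
+from kyle.calibration.calibration_methods import TemperatureScaling
+from kyle.metrics import ECE
+from kyle.models import CalibratableModel
+from kyle.sampling.fake_clf import DirichletFC
+
+# wrap any probabilistic classifier and recalibrate it via post-processing
+calibratable_model = CalibratableModel(model, TemperatureScaling())
+calibrator = ModelCalibrator(X_calibrate=X_val, y_calibrate=y_val, X_fit=X_train, y_fit=y_train)
+calibrator.calibrate(calibratable_model)
+
+# measure calibration before and after with the expected calibration error
+ece = ECE(bins=10)
+print(ece.compute(calibratable_model.predict_proba(X_val), y_val))
+
+# alternatively, generate ground truth labels and confidence vectors without any model
+sampler = DirichletFC(num_classes=2)
+ground_truth, confidences = sampler.get_sample_arrays(1000)
+print(ece.compute(confidences, ground_truth))
+```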
diff --git a/badges/.gitignore b/badges/.gitignore new file mode 100644 index 0000000..d6b7ef3 --- /dev/null +++ b/badges/.gitignore @@ -0,0 +1,2 @@ +* +!.gitignore diff --git a/badges/coverage.svg b/badges/coverage.svg new file mode 100644 index 0000000..565169e --- /dev/null +++ b/badges/coverage.svg @@ -0,0 +1,21 @@ + + + + + + + + + + + + + + + + coverage + coverage + 28% + 28% + + diff --git a/build_scripts/release-version.sh b/build_scripts/release-version.sh new file mode 100755 index 0000000..ecf89ad --- /dev/null +++ b/build_scripts/release-version.sh @@ -0,0 +1,192 @@ +#!/usr/bin/env bash + +set -euo pipefail + +## TTY colors and attributes +#normal=$(tput sgr0) # normal text +normal=$'\e[0m' # (works better sometimes) +bold=$(tput bold) # make colors bold/bright +red="$bold$(tput setaf 1)" # bright red text +green=$(tput setaf 2) # dim green text +fawn=$(tput setaf 3); beige="$fawn" # dark yellow text +yellow="$bold$fawn" # bright yellow text +darkblue=$(tput setaf 4) # dim blue text +blue="$bold$darkblue" # bright blue text +purple=$(tput setaf 5); magenta="$purple" # magenta text +pink="$bold$purple" # bright magenta text +darkcyan=$(tput setaf 6) # dim cyan text +cyan="$bold$darkcyan" # bright cyan text +gray=$(tput setaf 7) # dim white text +darkgray="$bold"$(tput setaf 0) # bold black = dark gray text +white="$bold$gray" # bright white text + + +function fail() { + echo "${red}$1${normal}" + exit 1 +} + +function usage() { + cat > /dev/stderr < Union[float, str, List, Dict]: + def _get_non_empty_entry( + self, key: Union[str, List[str]] + ) -> Union[float, str, List, Dict]: """ Retrieves an entry from the configuration @@ -65,10 +68,14 @@ def _get_path(self, key: Union[str, List[str]], create=False) -> str: if isinstance(key, list): key = ".".join(key) # purely for logging if create: - log.info(f"Configured directory {key}='{path}' not found; will create it") + log.info( + f"Configured directory {key}='{path}' not found; will create it" + ) os.makedirs(path) else: - raise FileNotFoundError(f"Configured directory {key}='{path}' does not exist.") + raise FileNotFoundError( + f"Configured directory {key}='{path}' does not exist." + ) return path.replace("/", os.sep) @property diff --git a/docs/.gitignore b/docs/.gitignore new file mode 100644 index 0000000..fa65608 --- /dev/null +++ b/docs/.gitignore @@ -0,0 +1 @@ +*.ipynb diff --git a/docs/conf.py b/docs/conf.py index 0257b5f..1ccbd01 100644 --- a/docs/conf.py +++ b/docs/conf.py @@ -1,297 +1,323 @@ -# -*- coding: utf-8 -*- -# -# kale documentation build configuration file -# -# This file is execfile()d with the current directory set to its containing dir. -# -# All configuration values have a default; values that are commented out -# serve to show the default. - -import ast -import logging -import os -import sys - -log = logging.getLogger("docs") - -# If extensions (or modules to document with autodoc) are in another directory, -# add these directories to sys.path here. If the directory is relative to the -# documentation root, use os.path.abspath to make it absolute, like shown here. -sys.path.insert(0, os.path.abspath('../src')) -print(sys.path) - -# -- General configuration ----------------------------------------------------- - -# If your documentation needs a minimal Sphinx version, state it here. -# needs_sphinx = '1.0' - -# Add any Sphinx extension module names here, as strings. They can be extensions -# coming with Sphinx (named 'sphinx.ext.*') or your custom ones. 
-extensions = ['sphinx.ext.napoleon', 'sphinx.ext.autodoc', 'sphinx.ext.doctest', 'sphinx.ext.linkcode', 'sphinx_rtd_theme'] - - -# adding links to source files (this works for gitlab and github like hosts and might need to be adjusted for others) -# see https://www.sphinx-doc.org/en/master/usage/extensions/linkcode.html#module-sphinx.ext.linkcode -def linkcode_resolve(domain, info): - link_prefix = "https://gitlab.aai.lab/tl/calibration/kale/blob/master" - if domain != 'py': - return None - if not info['module']: - return None - - path, link_extension = get_path_and_link_extension(info['module']) - object_name = info['fullname'] - if "." in object_name: # don't add source link to methods within classes (you might want to change that) - return None - lineno = lineno_from_object_name(path, object_name) - return f"{link_prefix}/{link_extension}#L{lineno}" - - -def get_path_and_link_extension(module: str): - """ - :return: tuple of the form (path, link_extension) where - the first entry is the local path to a given module or to __init__.py of the package - and the second entry is the corresponding path from the top level directory - """ - filename = module.replace('.', '/') - docs_dir = os.path.dirname(os.path.realpath(__file__)) - source_path_prefix = os.path.join(docs_dir, f"../src/{filename}") - - if os.path.exists(source_path_prefix + ".py"): - link_extension = f"src/{filename}.py" - return source_path_prefix + ".py", link_extension - elif os.path.exists(os.path.join(source_path_prefix, "__init__.py")): - link_extension = f"src/{filename}/__init__.py" - return os.path.join(source_path_prefix, "__init__.py"), link_extension - else: - raise Exception(f"{source_path_prefix} is neither a module nor a package with init - " - f"did you forget to add an __init__.py?") - - -def lineno_from_object_name(source_file, object_name): - desired_node_name = object_name.split(".")[0] - with open(source_file, 'r') as f: - source_node = ast.parse(f.read()) - desired_node = next((node for node in source_node.body if getattr(node, "name", "") == desired_node_name), None) - if desired_node is None: - log.warning(f"Could not find object {desired_node_name} in {source_file}") - return 0 - else: - return desired_node.lineno - - -# this is useful for keeping the docs build environment small. Add heavy requirements here -# and all other requirements to docs/requirements.txt -autodoc_mock_imports = ["torch", "pyro"] - -autodoc_default_options = { - 'exclude-members': 'log', - 'member-order': 'bysource', - 'show-inheritance': True -} - -# Add any paths that contain templates here, relative to this directory. -templates_path = ['_templates'] - -# The suffix of source filenames. -source_suffix = '.rst' - -# The encoding of source files. -# source_encoding = 'utf-8-sig' - -# The master toctree document. -master_doc = 'index' - -# General information about the project. -project = u'kale' - -# The version info for the project you're documenting, acts as replacement for -# |version| and |release|, also used in various other places throughout the -# built documents. -# -# The short X.Y version. -version = '0.1' -# The full version, including alpha/beta/rc tags. -release = '0.1.0' - -# The language for content autogenerated by Sphinx. Refer to documentation -# for a list of supported languages. -# language = None - -# There are two options for replacing |today|: either, you set today to some -# non-false value, then it is used: -# today = '' -# Else, today_fmt is used as the format for a strftime call. 
-# today_fmt = '%B %d, %Y' - -# List of patterns, relative to source directory, that match files and -# directories to ignore when looking for source files. -exclude_patterns = ['_build'] - -# The reST default role (used for this markup: `text`) to use for all documents. -# default_role = None - -# If true, '()' will be appended to :func: etc. cross-reference text. -# add_function_parentheses = True - -# If true, the current module name will be prepended to all description -# unit titles (such as .. function::). -add_module_names = False - -# If true, sectionauthor and moduleauthor directives will be shown in the -# output. They are ignored by default. -# show_authors = False - -# The name of the Pygments (syntax highlighting) style to use. -pygments_style = 'sphinx' - -# A list of ignored prefixes for module index sorting. -# modindex_common_prefix = [] - - -# -- Options for HTML output --------------------------------------------------- - -# The theme to use for HTML and HTML Help pages. See the documentation for -# a list of builtin themes. -html_theme = 'sphinx_rtd_theme' - -# Theme options are theme-specific and customize the look and feel of a theme -# further. For a list of options available for each theme, see the -# documentation. -# html_theme_options = {} - -# Add any paths that contain custom themes here, relative to this directory. -# html_theme_path = [] - -# The name for this set of Sphinx documents. If None, it defaults to -# " v documentation". -# html_title = None - -# A shorter title for the navigation bar. Default is the same as html_title. -# html_short_title = None - -# The name of an image file (relative to this directory) to place at the top -# of the sidebar. -# html_logo = None - -# The name of an image file (within the static path) to use as favicon of the -# docs. This file should be a Windows icon file (.ico) being 16x16 or 32x32 -# pixels large. -# html_favicon = None - -# Add any paths that contain custom static files (such as style sheets) here, -# relative to this directory. They are copied after the builtin static files, -# so a file named "default.css" will overwrite the builtin "default.css". -html_static_path = [] - -# If not '', a 'Last updated on:' timestamp is inserted at every page bottom, -# using the given strftime format. -# html_last_updated_fmt = '%b %d, %Y' - -# If true, SmartyPants will be used to convert quotes and dashes to -# typographically correct entities. -# html_use_smartypants = True - -# Custom sidebar templates, maps document names to template names. -# html_sidebars = {} - -# Additional templates that should be rendered to pages, maps page names to -# template names. -# html_additional_pages = {} - -# If false, no module index is generated. -# html_domain_indices = True - -# If false, no index is generated. -# html_use_index = True - -# If true, the index is split into individual pages for each letter. -# html_split_index = False - -# If true, links to the reST sources are added to the pages. -# html_show_sourcelink = True - -# If true, "Created using Sphinx" is shown in the HTML footer. Default is True. -# html_show_sphinx = True - -# If true, "(C) Copyright ..." is shown in the HTML footer. Default is True. -# html_show_copyright = True - -# If true, an OpenSearch description file will be output, and all pages will -# contain a tag referring to it. The value of this option must be the -# base URL from which the finished HTML is served. -# html_use_opensearch = '' - -# This is the file name suffix for HTML files (e.g. ".xhtml"). 
-# html_file_suffix = None - -# Output file base name for HTML help builder. -htmlhelp_basename = 'kale_doc' - - -# -- Options for LaTeX output -------------------------------------------------- - -latex_elements = { - # The paper size ('letterpaper' or 'a4paper'). - # 'papersize': 'letterpaper', - - # The font size ('10pt', '11pt' or '12pt'). - # 'pointsize': '10pt', - - # Additional stuff for the LaTeX preamble. - # 'preamble': '', -} - -# Grouping the document tree into LaTeX files. List of tuples -# (source start file, target name, title, author, documentclass [howto/manual]). -# latex_documents = [] - -# The name of an image file (relative to this directory) to place at the top of -# the title page. -# latex_logo = None - -# For "manual" documents, if this is true, then toplevel headings are parts, -# not chapters. -# latex_use_parts = False - -# If true, show page references after internal links. -# latex_show_pagerefs = False - -# If true, show URL addresses after external links. -# latex_show_urls = False - -# Documents to append as an appendix to all manuals. -# latex_appendices = [] - -# If false, no module index is generated. -# latex_domain_indices = True - - -# -- Options for manual page output -------------------------------------------- - -# One entry per manual page. List of tuples -# (source start file, name, description, authors, manual section). -man_pages = [ - ('index', 'kale', '', - ["Miguel and Mischa"], 1) -] - -# If true, show URL addresses after external links. -# man_show_urls = False - - -# -- Options for Texinfo output ------------------------------------------------ - -# Grouping the document tree into Texinfo files. List of tuples -# (source start file, target name, title, author, -# dir menu entry, description, category) -# texinfo_documents = [] - -# Documents to append as an appendix to all manuals. -# texinfo_appendices = [] - -# If false, no module index is generated. -# texinfo_domain_indices = True - -# How to display URL addresses: 'footnote', 'no', or 'inline'. -# texinfo_show_urls = 'footnote' +# -*- coding: utf-8 -*- +# +# kyle documentation build configuration file +# +# This file is execfile()d with the current directory set to its containing dir. +# +# All configuration values have a default; values that are commented out +# serve to show the default. + +import ast +import logging +import os +import sys + +import pkg_resources + +log = logging.getLogger("docs") + +# If extensions (or modules to document with autodoc) are in another directory, +# add these directories to sys.path here. If the directory is relative to the +# documentation root, use os.path.abspath to make it absolute, like shown here. +sys.path.insert(0, os.path.abspath("../src")) +print(sys.path) + +# -- General configuration ----------------------------------------------------- + +# If your documentation needs a minimal Sphinx version, state it here. +# needs_sphinx = '1.0' + +# Add any Sphinx extension module names here, as strings. They can be extensions +# coming with Sphinx (named 'sphinx.ext.*') or your custom ones. 
+extensions = [
+    "sphinx.ext.napoleon",
+    "sphinx.ext.autodoc",
+    "sphinx.ext.doctest",
+    "sphinx.ext.linkcode",
+    "sphinx_rtd_theme",
+    "nbsphinx",
+    # see https://github.com/spatialaudio/nbsphinx/issues/24 for an explanation why this extension is necessary
+    "IPython.sphinxext.ipython_console_highlighting",
+]
+
+
+# adding links to source files (this works for gitlab and github like hosts and might need to be adjusted for others)
+# see https://www.sphinx-doc.org/en/master/usage/extensions/linkcode.html#module-sphinx.ext.linkcode
+def linkcode_resolve(domain, info):
+    link_prefix = "https://gitlab.aai.lab/tl/calibration/kyle/blob/develop"
+    if domain != "py":
+        return None
+    if not info["module"]:
+        return None
+
+    path, link_extension = get_path_and_link_extension(info["module"])
+    object_name = info["fullname"]
+    if (
+        "." in object_name
+    ):  # don't add source link to methods within classes (you might want to change that)
+        return None
+    lineno = lineno_from_object_name(path, object_name)
+    return f"{link_prefix}/{link_extension}#L{lineno}"
+
+
+def get_path_and_link_extension(module: str):
+    """
+    :return: tuple of the form (path, link_extension) where
+        the first entry is the local path to a given module or to __init__.py of the package
+        and the second entry is the corresponding path from the top level directory
+    """
+    filename = module.replace(".", "/")
+    docs_dir = os.path.dirname(os.path.realpath(__file__))
+    source_path_prefix = os.path.join(docs_dir, f"../src/{filename}")
+
+    if os.path.exists(source_path_prefix + ".py"):
+        link_extension = f"src/{filename}.py"
+        return source_path_prefix + ".py", link_extension
+    elif os.path.exists(os.path.join(source_path_prefix, "__init__.py")):
+        link_extension = f"src/{filename}/__init__.py"
+        return os.path.join(source_path_prefix, "__init__.py"), link_extension
+    else:
+        raise Exception(
+            f"{source_path_prefix} is neither a module nor a package with init - "
+            f"did you forget to add an __init__.py?"
+        )
+
+
+def lineno_from_object_name(source_file, object_name):
+    desired_node_name = object_name.split(".")[0]
+    with open(source_file, "r") as f:
+        source_node = ast.parse(f.read())
+    desired_node = next(
+        (
+            node
+            for node in source_node.body
+            if getattr(node, "name", "") == desired_node_name
+        ),
+        None,
+    )
+    if desired_node is None:
+        log.warning(f"Could not find object {desired_node_name} in {source_file}")
+        return 0
+    else:
+        return desired_node.lineno
+
+
+# this is useful for keeping the docs build environment small. Add heavy requirements here
+# and all other requirements to docs/requirements.txt
+autodoc_mock_imports = [
+    "netcal",
+    "torch",
+    "kornia",
+    "torchvision",
+    "pytorch_lightning",  # module name (the pip package is called pytorch-lightning)
+    "matplotlib",
+]
+
+autodoc_default_options = {
+    "exclude-members": "log",
+    "member-order": "bysource",
+    "show-inheritance": True,
+}
+
+# Add any paths that contain templates here, relative to this directory.
+templates_path = ["_templates"]
+
+# The suffix of source filenames.
+source_suffix = ".rst"
+
+# The encoding of source files.
+# source_encoding = 'utf-8-sig'
+
+# The master toctree document.
+master_doc = "index"
+
+# General information about the project.
+project = "kyle"
+
+# The version info for the project you're documenting, acts as replacement for
+# |version| and |release|, also used in various other places throughout the
+# built documents.
+#
+# The full version, including alpha/beta/rc tags.
+version = pkg_resources.get_distribution(project).version
+release = version
+# The short X.Y version.
+major_v, minor_v = version.split(".")[:2] +version = f"{major_v}.{minor_v}" + +# The language for content autogenerated by Sphinx. Refer to documentation +# for a list of supported languages. +# language = None + +# There are two options for replacing |today|: either, you set today to some +# non-false value, then it is used: +# today = '' +# Else, today_fmt is used as the format for a strftime call. +# today_fmt = '%B %d, %Y' + +# List of patterns, relative to source directory, that match files and +# directories to ignore when looking for source files. +exclude_patterns = ["_build"] + +# The reST default role (used for this markup: `text`) to use for all documents. +# default_role = None + +# If true, '()' will be appended to :func: etc. cross-reference text. +# add_function_parentheses = True + +# If true, the current module name will be prepended to all description +# unit titles (such as .. function::). +add_module_names = False + +# If true, sectionauthor and moduleauthor directives will be shown in the +# output. They are ignored by default. +# show_authors = False + +# The name of the Pygments (syntax highlighting) style to use. +pygments_style = "sphinx" + +# A list of ignored prefixes for module index sorting. +# modindex_common_prefix = [] + + +# -- Options for HTML output --------------------------------------------------- + +# The theme to use for HTML and HTML Help pages. See the documentation for +# a list of builtin themes. +html_theme = "sphinx_rtd_theme" + +# Theme options are theme-specific and customize the look and feel of a theme +# further. For a list of options available for each theme, see the +# documentation. +# html_theme_options = {} + +# Add any paths that contain custom themes here, relative to this directory. +# html_theme_path = [] + +# The name for this set of Sphinx documents. If None, it defaults to +# " v documentation". +# html_title = None + +# A shorter title for the navigation bar. Default is the same as html_title. +# html_short_title = None + +# The name of an image file (relative to this directory) to place at the top +# of the sidebar. +# html_logo = None + +# The name of an image file (within the static path) to use as favicon of the +# docs. This file should be a Windows icon file (.ico) being 16x16 or 32x32 +# pixels large. +# html_favicon = None + +# Add any paths that contain custom static files (such as style sheets) here, +# relative to this directory. They are copied after the builtin static files, +# so a file named "default.css" will overwrite the builtin "default.css". +html_static_path = [] + +# If not '', a 'Last updated on:' timestamp is inserted at every page bottom, +# using the given strftime format. +# html_last_updated_fmt = '%b %d, %Y' + +# If true, SmartyPants will be used to convert quotes and dashes to +# typographically correct entities. +# html_use_smartypants = True + +# Custom sidebar templates, maps document names to template names. +# html_sidebars = {} + +# Additional templates that should be rendered to pages, maps page names to +# template names. +# html_additional_pages = {} + +# If false, no module index is generated. +# html_domain_indices = True + +# If false, no index is generated. +# html_use_index = True + +# If true, the index is split into individual pages for each letter. +# html_split_index = False + +# If true, links to the reST sources are added to the pages. +# html_show_sourcelink = True + +# If true, "Created using Sphinx" is shown in the HTML footer. Default is True. 
+# html_show_sphinx = True + +# If true, "(C) Copyright ..." is shown in the HTML footer. Default is True. +# html_show_copyright = True + +# If true, an OpenSearch description file will be output, and all pages will +# contain a tag referring to it. The value of this option must be the +# base URL from which the finished HTML is served. +# html_use_opensearch = '' + +# This is the file name suffix for HTML files (e.g. ".xhtml"). +# html_file_suffix = None + +# Output file base name for HTML help builder. +htmlhelp_basename = "kyle_doc" + + +# -- Options for LaTeX output -------------------------------------------------- + +latex_elements = { + # The paper size ('letterpaper' or 'a4paper'). + # 'papersize': 'letterpaper', + # The font size ('10pt', '11pt' or '12pt'). + # 'pointsize': '10pt', + # Additional stuff for the LaTeX preamble. + # 'preamble': '', +} + +# Grouping the document tree into LaTeX files. List of tuples +# (source start file, target name, title, author, documentclass [howto/manual]). +# latex_documents = [] + +# The name of an image file (relative to this directory) to place at the top of +# the title page. +# latex_logo = None + +# For "manual" documents, if this is true, then toplevel headings are parts, +# not chapters. +# latex_use_parts = False + +# If true, show page references after internal links. +# latex_show_pagerefs = False + +# If true, show URL addresses after external links. +# latex_show_urls = False + +# Documents to append as an appendix to all manuals. +# latex_appendices = [] + +# If false, no module index is generated. +# latex_domain_indices = True + + +# -- Options for manual page output -------------------------------------------- + +# One entry per manual page. List of tuples +# (source start file, name, description, authors, manual section). +man_pages = [("index", "kyle", "", ["Miguel and Mischa"], 1)] + +# If true, show URL addresses after external links. +# man_show_urls = False + + +# -- Options for Texinfo output ------------------------------------------------ + +# Grouping the document tree into Texinfo files. List of tuples +# (source start file, target name, title, author, +# dir menu entry, description, category) +# texinfo_documents = [] + +# Documents to append as an appendix to all manuals. +# texinfo_appendices = [] + +# If false, no module index is generated. +# texinfo_domain_indices = True + +# How to display URL addresses: 'footnote', 'no', or 'inline'. +# texinfo_show_urls = 'footnote' diff --git a/docs/index.rst b/docs/index.rst index a2ca319..cb26143 100644 --- a/docs/index.rst +++ b/docs/index.rst @@ -1,5 +1,5 @@ -kale -============================== +kyle library and game +===================== .. toctree:: :caption: Guides and Tutorials @@ -11,7 +11,7 @@ kale .. toctree:: :caption: Modules - kale/index + kyle/index diff --git a/docs/kyle/calibration.rst b/docs/kyle/calibration.rst new file mode 100644 index 0000000..e5acb45 --- /dev/null +++ b/docs/kyle/calibration.rst @@ -0,0 +1,11 @@ +calibration +=========== + +.. automodule:: kyle.calibration + :members: + :undoc-members: + +.. toctree:: + :glob: + + calibration/* diff --git a/docs/kyle/calibration/calibration_methods.rst b/docs/kyle/calibration/calibration_methods.rst new file mode 100644 index 0000000..6c532bb --- /dev/null +++ b/docs/kyle/calibration/calibration_methods.rst @@ -0,0 +1,11 @@ +calibration\_methods +==================== + +.. automodule:: kyle.calibration.calibration_methods + :members: + :undoc-members: + +.. 
toctree:: + :glob: + + calibration_methods/* diff --git a/docs/kyle/calibration/calibration_methods/calibration_methods.rst b/docs/kyle/calibration/calibration_methods/calibration_methods.rst new file mode 100644 index 0000000..3359389 --- /dev/null +++ b/docs/kyle/calibration/calibration_methods/calibration_methods.rst @@ -0,0 +1,6 @@ +calibration\_methods +==================== + +.. automodule:: kyle.calibration.calibration_methods.calibration_methods + :members: + :undoc-members: diff --git a/docs/kyle/calibration/model_calibrator.rst b/docs/kyle/calibration/model_calibrator.rst new file mode 100644 index 0000000..e5652ca --- /dev/null +++ b/docs/kyle/calibration/model_calibrator.rst @@ -0,0 +1,6 @@ +model\_calibrator +================= + +.. automodule:: kyle.calibration.model_calibrator + :members: + :undoc-members: diff --git a/docs/kale/sampling.rst b/docs/kyle/datasets.rst similarity index 51% rename from docs/kale/sampling.rst rename to docs/kyle/datasets.rst index f7f5b6c..6c4b5ed 100644 --- a/docs/kale/sampling.rst +++ b/docs/kyle/datasets.rst @@ -1,6 +1,6 @@ -sampling +datasets ======== -.. automodule:: kale.sampling +.. automodule:: kyle.datasets :members: :undoc-members: diff --git a/docs/kyle/evaluation.rst b/docs/kyle/evaluation.rst new file mode 100644 index 0000000..42cb39a --- /dev/null +++ b/docs/kyle/evaluation.rst @@ -0,0 +1,11 @@ +evaluation +========== + +.. automodule:: kyle.evaluation + :members: + :undoc-members: + +.. toctree:: + :glob: + + evaluation/* diff --git a/docs/kyle/evaluation/continuous.rst b/docs/kyle/evaluation/continuous.rst new file mode 100644 index 0000000..46450c0 --- /dev/null +++ b/docs/kyle/evaluation/continuous.rst @@ -0,0 +1,6 @@ +continuous +========== + +.. automodule:: kyle.evaluation.continuous + :members: + :undoc-members: diff --git a/docs/kyle/evaluation/discrete.rst b/docs/kyle/evaluation/discrete.rst new file mode 100644 index 0000000..052d6fa --- /dev/null +++ b/docs/kyle/evaluation/discrete.rst @@ -0,0 +1,6 @@ +discrete +======== + +.. automodule:: kyle.evaluation.discrete + :members: + :undoc-members: diff --git a/docs/kyle/index.rst b/docs/kyle/index.rst new file mode 100644 index 0000000..02b5fb8 --- /dev/null +++ b/docs/kyle/index.rst @@ -0,0 +1,11 @@ +Library Modules +=============== + +.. automodule:: kyle + :members: + :undoc-members: + +.. toctree:: + :glob: + + * diff --git a/docs/kyle/integrals.rst b/docs/kyle/integrals.rst new file mode 100644 index 0000000..76bc52d --- /dev/null +++ b/docs/kyle/integrals.rst @@ -0,0 +1,6 @@ +integrals +========= + +.. automodule:: kyle.integrals + :members: + :undoc-members: diff --git a/docs/kale/index.rst b/docs/kyle/metrics.rst old mode 100755 new mode 100644 similarity index 51% rename from docs/kale/index.rst rename to docs/kyle/metrics.rst index a67800e..87b86ce --- a/docs/kale/index.rst +++ b/docs/kyle/metrics.rst @@ -1,11 +1,11 @@ -Modules -======= - -.. automodule:: kale - :members: - :undoc-members: - -.. toctree:: - :glob: - - * +metrics +======= + +.. automodule:: kyle.metrics + :members: + :undoc-members: + +.. toctree:: + :glob: + + metrics/* diff --git a/docs/kyle/metrics/calibration_metrics.rst b/docs/kyle/metrics/calibration_metrics.rst new file mode 100644 index 0000000..6f89c80 --- /dev/null +++ b/docs/kyle/metrics/calibration_metrics.rst @@ -0,0 +1,6 @@ +calibration\_metrics +==================== + +.. 
automodule:: kyle.metrics.calibration_metrics + :members: + :undoc-members: diff --git a/docs/kyle/models.rst b/docs/kyle/models.rst new file mode 100644 index 0000000..84b73e0 --- /dev/null +++ b/docs/kyle/models.rst @@ -0,0 +1,11 @@ +models +====== + +.. automodule:: kyle.models + :members: + :undoc-members: + +.. toctree:: + :glob: + + models/* diff --git a/docs/kyle/models/calibratable_model.rst b/docs/kyle/models/calibratable_model.rst new file mode 100644 index 0000000..34e792c --- /dev/null +++ b/docs/kyle/models/calibratable_model.rst @@ -0,0 +1,6 @@ +calibratable\_model +=================== + +.. automodule:: kyle.models.calibratable_model + :members: + :undoc-members: diff --git a/docs/kyle/models/resnet.rst b/docs/kyle/models/resnet.rst new file mode 100644 index 0000000..3119890 --- /dev/null +++ b/docs/kyle/models/resnet.rst @@ -0,0 +1,6 @@ +resnet +====== + +.. automodule:: kyle.models.resnet + :members: + :undoc-members: diff --git a/docs/kyle/sampling.rst b/docs/kyle/sampling.rst new file mode 100644 index 0000000..c504480 --- /dev/null +++ b/docs/kyle/sampling.rst @@ -0,0 +1,11 @@ +sampling +======== + +.. automodule:: kyle.sampling + :members: + :undoc-members: + +.. toctree:: + :glob: + + sampling/* diff --git a/docs/kyle/sampling/fake_clf.rst b/docs/kyle/sampling/fake_clf.rst new file mode 100644 index 0000000..2fba1b9 --- /dev/null +++ b/docs/kyle/sampling/fake_clf.rst @@ -0,0 +1,6 @@ +fake\_clf +========= + +.. automodule:: kyle.sampling.fake_clf + :members: + :undoc-members: diff --git a/docs/kyle/transformations.rst b/docs/kyle/transformations.rst new file mode 100644 index 0000000..745f956 --- /dev/null +++ b/docs/kyle/transformations.rst @@ -0,0 +1,6 @@ +transformations +=============== + +.. automodule:: kyle.transformations + :members: + :undoc-members: diff --git a/docs/kyle/util.rst b/docs/kyle/util.rst new file mode 100644 index 0000000..6e39c97 --- /dev/null +++ b/docs/kyle/util.rst @@ -0,0 +1,6 @@ +util +==== + +.. automodule:: kyle.util + :members: + :undoc-members: diff --git a/notebooks/calibration_demo.ipynb b/notebooks/calibration_demo.ipynb new file mode 100644 index 0000000..250ddc4 --- /dev/null +++ b/notebooks/calibration_demo.ipynb @@ -0,0 +1,508 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import numpy as np\n", + "\n", + "from sklearn import datasets\n", + "from sklearn.model_selection import train_test_split\n", + "from sklearn.neural_network import MLPClassifier\n", + "from sklearn.metrics import accuracy_score\n", + "\n", + "from kyle.calibration import ModelCalibrator\n", + "from kyle.models import CalibratableModel\n", + "from kyle.metrics import ECE\n", + "from kyle.calibration.calibration_methods import TemperatureScaling\n", + "from kyle.sampling.fake_clf import DirichletFC\n", + "from kyle.transformations import MaxComponentSimplexAut\n", + "from kyle.evaluation import EvalStats" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# What is calibration?" 
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "When we talk about how good a machine learning model is, what we (generally) mean to ask is: How accurate is the model?\n",
+    "While this is a good enough metric in many cases, we are, in fact, leaving out important information about the model.\n",
+    "One such piece of information is concerned with whether the confidence of the model is in line with its accuracy.\n",
+    "If it is, we say the model is calibrated.\n",
+    "\n",
+    "To explain this concept in detail, let's begin with an example. Suppose we want to predict whether a patient has cancer.\n",
+    "We can simulate data with two classes, i.e. $y \\in \\{0, 1\\}$, where $y=0$ denotes a healthy patient and $y=1$ denotes a\n",
+    "patient who has cancer."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "n_samples = 2000\n",
+    "n_classes = 2"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "X, y = datasets.make_classification(\n",
+    "    n_samples=n_samples,\n",
+    "    n_features=20,\n",
+    "    n_informative=7,\n",
+    "    n_redundant=10,\n",
+    "    n_classes=n_classes,\n",
+    "    random_state=42,\n",
+    ")\n",
+    "X_train, X_test, y_train, y_test = train_test_split(\n",
+    "    X, y, test_size=0.2, random_state=42\n",
+    ")"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "We can then train a neural network on our data:"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "model = MLPClassifier(hidden_layer_sizes=(50, 50, 50))\n",
+    "model.fit(X_train, y_train)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "and make predictions on new samples. Let's see how our model performs on unseen examples:"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "y_pred = model.predict(X_test)\n",
+    "model_accuracy = accuracy_score(y_test, y_pred)\n",
+    "\n",
+    "f\"Model accuracy: {model_accuracy*100}%\""
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "That seems pretty good! One might think our job here is done: After all, the model predicts whether a person has cancer\n",
+    "or not with decent accuracy.\n",
+    "Unfortunately, a model's accuracy does not tell us the full story. This is because, at inference time, a model outputs\n",
+    "confidence scores for each class for a given sample. We then take the class with the highest confidence\n",
+    "and interpret that as the prediction of the model.\n",
+    "\n",
+    "This conversion of continuous (probability) to discrete (label) values can hide certain properties of the model.\n",
+    "To illustrate this, let's take two models, $A$ and $B$, trained on the same data. Let's further assume they have\n",
+    "similar accuracy. Suppose we test both models with 10 healthy samples. $A$ assigns probabilities $(0.49, 0.51)$ to all\n",
+    "samples, whereas $B$ assigns $(0.1, 0.9)$. While both $A$ and $B$ will be wrong 100% of the time, note that $A$ comes\n",
+    "much closer to assigning the samples to the correct class than $B$ does."
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "Continuing with our previous example: Imagine that on all examples where the model was $95$% confident that the subject\n",
+    "has cancer, it was correct $70$% of the time. Intuitively, it seems there's something not quite right with the model:\n",
+    "the model is over-confident in its predictions. This notion is formalized by the concept of calibration.\n",
+    "We say a model is (strongly) calibrated when, for any confidence value $p \\in [0, 1]$,\n",
+    "a prediction of a class with confidence $p$ is correct with probability $p$:\n",
+    "\n",
+    "\\begin{equation}\n",
+    "P(\\widehat{y}=y|\\widehat{p}=p) = p \\quad \\forall p \\in [0, 1]\n",
+    "\\end{equation}"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "So, is our model calibrated? As we can see in the equation above, $\\widehat{p}$ is continuous, which means we cannot\n",
+    "compute the equation with finite data. We can, however, develop empirical measures that approximate the true measure\n",
+    "of (mis)calibration.\n",
+    "\n",
+    "One simple way to get an empirical estimate of the model's accuracy and confidence is to discretize the probability\n",
+    "space. This is done by slicing $p$ into $K$ equal-sized bins. We can then calculate the accuracy and confidence for each\n",
+    "bin:\n",
+    "\n",
+    "\\begin{equation}\n",
+    "accuracy_{B_k} = \\frac{1}{|B_k|} \\sum_{m=1}^{|B_k|}1(\\widehat{y}_m=y_m)\n",
+    "\\end{equation}\n",
+    "\n",
+    "\\begin{equation}\n",
+    "confidence_{B_k} = \\frac{1}{|B_k|} \\sum_{m=1}^{|B_k|}\\widehat{p}_m\n",
+    "\\end{equation}"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "We can now simply calculate the weighted average difference between the accuracy and confidence of the model over all bins:\n",
+    "\n",
+    "\\begin{equation}\n",
+    "\\sum_{k=1}^{K} \\frac{|B_k|}{n} \\Big|\\:accuracy_{B_k} - confidence_{B_k} \\Big|\n",
+    "\\end{equation}\n",
+    "\n",
+    "This is known as the **Expected Calibration Error** $(ECE)$. As can be seen, $ECE=0$ if a model is perfectly calibrated.\n",
+    "Let's calculate the $ECE$ for our model with $10$ bins:"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "ece = ECE(bins=10)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Evaluate uncalibrated predictions\n",
+    "uncalibrated_confidences = model.predict_proba(X_test)\n",
+    "\n",
+    "pre_calibration_ece = ece.compute(uncalibrated_confidences, y_test)\n",
+    "\n",
+    "f\"ECE before calibration: {pre_calibration_ece}\""
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "We can also visualize the extent of miscalibration by plotting the model's confidence *(x-axis)* vs. the ground truth\n",
+    "probability *(y-axis)*. For a perfectly calibrated model, the plot should be $y=x$. Let's see how our model fares:"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "eval_stats = EvalStats(y_test, uncalibrated_confidences)\n",
+    "class_labels = [i for i in range(n_classes)]\n",
+    "\n",
+    "eval_stats.plot_reliability_curves(class_labels)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "Okay, so our model is not calibrated, as $ECE>0$. Can we do anything to remedy the situation?"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "# Model calibration"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "Indeed, we can improve the calibration of our model using various techniques. What's more, we don't need to train our\n",
+    "model again; many calibration techniques are post-processing methods, i.e. they operate on the trained model's output\n",
+    "confidence scores. The confidence scores used for calibration are typically obtained on a validation set.\n",
+    "\n",
+    "In `kyle`, we have provided a `CalibratableModel` class which takes a model and, as the name suggests, makes it possible\n",
+    "to calibrate that model. By default, we use a technique called [*Temperature scaling*](https://arxiv.org/abs/1706.04599)\n",
+    "for calibration."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Create calibratable model\n",
+    "calibration_method = TemperatureScaling()\n",
+    "calibratable_model = CalibratableModel(model, calibration_method)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "We also provide a `ModelCalibrator` class which holds the data used to calibrate models:"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Create model calibrator and calibrate model\n",
+    "calibrator = ModelCalibrator(\n",
+    "    X_calibrate=X_test,\n",
+    "    y_calibrate=y_test,\n",
+    "    X_fit=X_train,\n",
+    "    y_fit=y_train,\n",
+    ")"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "We now have everything ready to calibrate our model:"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "calibrator.calibrate(calibratable_model)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "Let's see if calibrating the model improved the $ECE$ score:"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Passing X_test instead of X_calibrate in predict_proba() to make the comparison with the pre-calibration model clear,\n",
+    "# same reason for y_test in ece.compute()\n",
+    "calibrated_confidences = calibratable_model.predict_proba(X_test)\n",
+    "\n",
+    "post_calibration_ece = ece.compute(calibrated_confidences, y_test)\n",
+    "\n",
+    "f\"ECE before calibration: {pre_calibration_ece}, ECE after calibration: {post_calibration_ece}\""
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "Great! $ECE$ has improved. Let's also plot a reliability curve to visually confirm the improvement in calibration."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "eval_stats = EvalStats(y_test, calibrated_confidences)\n",
+    "\n",
+    "eval_stats.plot_reliability_curves(class_labels)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "Wonderful! We have successfully improved our model's calibration."
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "# Model-agnostic calibration"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "You may have noticed that to evaluate the (mis)calibration of a model, we don't require the model itself.\n",
+    "Rather, it is sufficient to have the confidence scores predicted by the model.\n",
+    "This means we can abstract away the model and generate both the ground truth and confidence scores via sampling processes.\n",
+    "\n",
+    "In `kyle` we have provided samplers that simulate different kinds of calibration properties.\n",
+    "One such sampler is the `DirichletFC` class, which provides calibrated ground truth and confidences by default."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "sampler = DirichletFC(num_classes=2)\n",
+    "\n",
+    "# Get 1000 calibrated fake confidence scores\n",
+    "calibrated_samples = sampler.get_sample_arrays(1000)\n",
+    "ground_truth, confidences = calibrated_samples"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "Let's evaluate the $ECE$ for these samples:"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "ece.compute(confidences, ground_truth)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "Wait, the $ECE>0$, so how can we say that the samples are calibrated?\n",
+    "\n",
+    "As mentioned earlier, we only have finite samples, so true miscalibration can only be measured asymptotically.\n",
+    "This means that the more samples we have, the more accurate $ECE$'s estimate becomes.\n",
+    "We can test this by generating *5x* as many samples as before and evaluating $ECE$ again:"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "calibrated_samples = sampler.get_sample_arrays(5000)\n",
+    "ground_truth, confidences = calibrated_samples\n",
+    "\n",
+    "ece.compute(confidences, ground_truth)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "As expected, $ECE$ goes down with more samples.\n",
+    "\n",
+    "We can also systematically generate uncalibrated samples by passing a simplex automorphism to the sampler.\n",
+    "For instance, the `MaxComponentSimplexAut` used below distorts the confidence scores by applying a user-supplied\n",
+    "transformation to the maximum component of each confidence vector."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "def overestimating_max(x: np.ndarray):\n",
+    "    x = x.copy()\n",
+    "    mask = x > 1/2\n",
+    "    x[mask] = x[mask] - (1/4 - (1-x[mask])**2)\n",
+    "    return x\n",
+    "\n",
+    "automorphism = MaxComponentSimplexAut(overestimating_max)\n",
+    "uncalibrated_sampler = DirichletFC(num_classes=2, simplex_automorphism=automorphism)\n",
+    "\n",
+    "# Get 1000 uncalibrated fake confidence scores\n",
+    "uncalibrated_samples = uncalibrated_sampler.get_sample_arrays(1000)\n",
+    "ground_truth, confidences = uncalibrated_samples"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "Let's see if the uncalibrated nature of the samples is validated by $ECE$:"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "ece.compute(confidences, ground_truth)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "Once again, let's draw *5x* as many samples as before and measure $ECE$ again, this time to verify that the\n",
+    "measured miscalibration persists with more samples instead of vanishing:"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "uncalibrated_samples = uncalibrated_sampler.get_sample_arrays(5000)\n",
+    "ground_truth, confidences = uncalibrated_samples\n",
+    "\n",
+    "ece.compute(confidences, ground_truth)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "Indeed, the calibration error remains high as we sample more instances: unlike for the calibrated sampler,\n",
+    "it does not vanish, which confirms that the sampler is genuinely miscalibrated."
+ ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.8.5" + } + }, + "nbformat": 4, + "nbformat_minor": 1 +} \ No newline at end of file diff --git a/notebooks/evaluating_cal_methods.ipynb b/notebooks/evaluating_cal_methods.ipynb new file mode 100644 index 0000000..ff35f00 --- /dev/null +++ b/notebooks/evaluating_cal_methods.ipynb @@ -0,0 +1,626 @@ +{ + "cells": [
+ { + "cell_type": "code", + "execution_count": null, + "outputs": [], + "source": [ + "# Note - this cell should be executed only once per session\n", + "import sys, os\n", + "\n", + "# in order to get top level modules and to have paths relative to repo root\n", + "\n", + "if os.path.basename(os.getcwd()) != \"notebooks\":\n", + "    raise Exception(\"Wrong directory. Did you execute this cell twice?\")\n", + "os.chdir(\"..\")\n", + "sys.path.append(os.path.abspath(\".\"))\n", + "\n", + "%load_ext autoreload\n", + "%autoreload 2" + ], + "metadata": { + "collapsed": false, + "pycharm": { + "name": "#%%\n" + } + } + },
+ { + "cell_type": "markdown", + "source": [ + "# Class-wise and Reduced Calibration Methods\n", + "\n", + "In this notebook we demonstrate two new strategies for calibrating probabilistic classifiers. These strategies can be\n", + "applied on top of any calibration algorithm and are therefore implemented as wrappers. We test the improvements\n", + "in different calibration errors due to these wrappers, with the non-wrapped calibration methods serving as baselines.\n", + "\n", + "The tests are performed on random forests trained on two synthetic data sets (balanced and imbalanced) as well as\n", + "on resnet20 trained on the CIFAR10 data set."
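, + "\n", + "Roughly speaking, the *class-wise* wrapper fits one calibration method $g_i$ per class in a one-vs-rest fashion and\n", + "recombines the outputs as $g(\\vec c) \\propto (g_1(c_1), \\dots, g_K(c_K))$, while the *reduced* wrapper calibrates only\n", + "the top-class confidence and rescales the remaining entries accordingly. This reading is inferred from the wrapper\n", + "class names used below; the precise definitions live in `kyle.calibration.calibration_methods`."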
+ ], + "metadata": { + "collapsed": false + } + }, + { + "cell_type": "code", + "execution_count": null, + "outputs": [], + "source": [ + "from collections import defaultdict\n", + "\n", + "from sklearn.datasets import make_classification\n", + "from sklearn.metrics import accuracy_score\n", + "from sklearn.ensemble import RandomForestClassifier\n", + "\n", + "import os\n", + "import requests\n", + "import logging\n", + "\n", + "from kyle.calibration.calibration_methods import *\n", + "from kyle.evaluation import EvalStats\n", + "\n", + "from scipy.special import softmax\n", + "\n", + "from sklearn.model_selection import StratifiedShuffleSplit, cross_val_score\n", + "\n", + "import numpy as np\n", + "import matplotlib.pyplot as plt" + ], + "metadata": { + "collapsed": false, + "pycharm": { + "name": "#%%\n" + } + } + }, + { + "cell_type": "markdown", + "source": [ + "## Helper functions for evaluation" + ], + "metadata": { + "collapsed": false + } + }, + { + "cell_type": "code", + "execution_count": null, + "outputs": [], + "source": [ + "DEFAULT_WRAPPERS = {\n", + " \"Baseline\": lambda method_factory: method_factory(),\n", + " \"Class-wise\": lambda method_factory: ClassWiseCalibration(method_factory),\n", + " \"Reduced\": lambda method_factory: ConfidenceReducedCalibration(method_factory()),\n", + " \"Class-wise reduced\": lambda method_factory:\n", + " ClassWiseCalibration(lambda : ConfidenceReducedCalibration(method_factory())),\n", + "}\n", + "\n", + "DEFAULT_CV = 6\n", + "DEFAULT_BINS = 25\n", + "\n", + "ALL_CALIBRATION_METHOD_FACTORIES = (\n", + " # TemperatureScaling,\n", + " BetaCalibration,\n", + " # LogisticCalibration,\n", + " IsotonicRegression,\n", + " HistogramBinning,\n", + ")\n", + "ALL_METRICS = (\n", + " \"ECE\",\n", + " \"cwECE\",\n", + ")\n", + "\n", + "\n", + "def compute_score(scaler, confs: np.ndarray, labels: np.ndarray, bins, metric=\"ECE\"):\n", + " calibrated_confs = scaler.get_calibrated_confidences(confs)\n", + " eval_stats = EvalStats(labels, calibrated_confs, bins=bins)\n", + " if metric == \"ECE\":\n", + " return eval_stats.expected_calibration_error()\n", + " elif metric == \"cwECE\":\n", + " return eval_stats.class_wise_expected_calibration_error()\n", + " elif isinstance(metric, int):\n", + " return eval_stats.expected_marginal_calibration_error(metric)\n", + " else:\n", + " raise ValueError(f\"Unknown metric {metric}\")\n", + "\n", + "def get_scores(scaler, metric, cv, bins, confs, labels):\n", + " scoring = lambda *args: compute_score(*args, bins=bins, metric=metric)\n", + " return cross_val_score(scaler, confs, labels, scoring=scoring, cv=cv)\n", + "\n", + "def plot_scores(wrapper_scores_dict: dict, title=\"\", ax=None, y_lim=None):\n", + " labels = wrapper_scores_dict.keys()\n", + " scores_collection = wrapper_scores_dict.values()\n", + "\n", + " if ax is None:\n", + " plt.figure(figsize=(14,7))\n", + " ax = plt.gca()\n", + " ax.set_title(title)\n", + " ax.boxplot(scores_collection, labels=labels)\n", + " if y_lim is not None:\n", + " ax.set_ylim(y_lim)\n", + "\n", + "def evaluate_calibration_wrappers(method_factory, confidences, gt_labels, wrappers_dict=None, metric=\"ECE\",\n", + " cv=DEFAULT_CV, method_name=None, bins=DEFAULT_BINS, short_description=False):\n", + " if method_name is None:\n", + " method_name = method_factory.__name__\n", + " if short_description:\n", + " description = f\"{method_name}\"\n", + " else:\n", + " description = f\"Evaluating wrappers of {method_name} on metric {metric} with {bins} bins\\n \" \\\n", + " f\"CV with {cv} 
folds on {len(confidences)} data points.\"\n", + " if wrappers_dict is None:\n", + " wrappers_dict = DEFAULT_WRAPPERS\n", + "\n", + " wrapper_scores_dict = {}\n", + " for wrapper_name, wrapper in wrappers_dict.items():\n", + " method = wrapper(method_factory)\n", + " scores = get_scores(method, metric, cv=cv, bins=bins, confs=confidences, labels=gt_labels)\n", + " wrapper_scores_dict[wrapper_name] = scores\n", + " return wrapper_scores_dict, description\n", + "\n", + "# taken such that minimum and maximum are visible in all plots\n", + "DEFAULT_Y_LIMS_DICT = {\n", + " \"ECE\": (0.004, 0.032),\n", + " \"cwECE\": (0.005, 0.018),\n", + "}\n", + "\n", + "def perform_default_evaluation(confidences, gt_labels, method_factories=ALL_CALIBRATION_METHOD_FACTORIES, metrics=ALL_METRICS):\n", + " evaluation_results = defaultdict(list)\n", + " for metric in metrics:\n", + " print(f\"Creating evaluation for {metric}\")\n", + " for method_factory in method_factories:\n", + " print(f\"Computing scores for {method_factory.__name__}\", end=\"\\r\")\n", + " result = evaluate_calibration_wrappers(method_factory, confidences=confidences, gt_labels=gt_labels,\n", + " metric=metric, short_description=True)\n", + " evaluation_results[metric].append(result)\n", + " return evaluation_results\n", + "\n", + "def plot_default_evaluation_results(evaluation_results: dict, figsize=(25, 7), y_lims_dict=None, title_addon=None):\n", + " if y_lims_dict is None:\n", + " y_lims_dict = DEFAULT_Y_LIMS_DICT\n", + " ncols = len(list(evaluation_results.values())[0])\n", + " for metric, results in evaluation_results.items():\n", + " fig, axes = plt.subplots(nrows=1, ncols=ncols, figsize=figsize)\n", + " y_lim = y_lims_dict[metric]\n", + " if ncols == 1: # axes fails to be a list if ncols=1\n", + " axes = [axes]\n", + " for col, result in zip(axes, results):\n", + " wrapper_scores_dict, description = result\n", + " plot_scores(wrapper_scores_dict, title=description, ax=col, y_lim=y_lim)\n", + "\n", + " title = f\"Evaluation with {metric} ({DEFAULT_CV} folds; {DEFAULT_BINS} bins)\"\n", + " if title_addon is not None:\n", + " title += f\"\\n{title_addon}\"\n", + " fig.suptitle(title)\n", + " plt.show()\n", + "\n" + ], + "metadata": { + "collapsed": false, + "pycharm": { + "name": "#%%\n" + } + } + }, + { + "cell_type": "markdown", + "source": [ + "## Part 1: Random Forest\n" + ], + "metadata": { + "collapsed": false, + "pycharm": { + "name": "#%% md\n" + } + } + }, + { + "cell_type": "markdown", + "source": [ + "## Load Data" + ], + "metadata": { + "collapsed": false, + "pycharm": { + "name": "#%% md\n" + } + } + }, + { + "cell_type": "code", + "execution_count": null, + "outputs": [], + "source": [ + "def get_calibration_dataset(n_classes=5, weights=None, n_samples=30000, n_informative=15, model=RandomForestClassifier()):\n", + " n_dataset_samples = 2 * n_samples\n", + " test_size = 0.5\n", + " X, y = make_classification(n_samples=n_dataset_samples, n_classes=n_classes,\n", + " n_informative=n_informative, weights=weights)\n", + " sss = StratifiedShuffleSplit(n_splits=1, test_size=test_size)\n", + "\n", + " train_index, test_index = list(sss.split(X, y))[0]\n", + " X_train, y_train = X[train_index], y[train_index]\n", + " X_test, y_test = X[test_index], y[test_index]\n", + " model.fit(X_train, y_train)\n", + " confidences = model.predict_proba(X_test)\n", + " y_pred = confidences.argmax(1)\n", + " accuracy = accuracy_score(y_pred, y_test)\n", + " print(f\"Model accuracy: {accuracy}\")\n", + " return confidences, y_test" + ], + 
"metadata": { + "collapsed": false, + "pycharm": { + "name": "#%%\n" + } + } + }, + { + "cell_type": "code", + "execution_count": null, + "outputs": [], + "source": [ + "# this takes a while\n", + "print(f\"Creating balanced dataset\")\n", + "balanced_confs, balanced_gt = get_calibration_dataset()\n", + "print(f\"Creating unbalanced dataset\")\n", + "unbalanced_confs, unbalanced_gt = get_calibration_dataset(weights=(0.3, 0.1, 0.25, 0.15))\n", + "\n" + ], + "metadata": { + "collapsed": false, + "pycharm": { + "name": "#%%\n" + } + } + }, + { + "cell_type": "markdown", + "source": [ + "## Evaluating wrappers on a single calibration method" + ], + "metadata": { + "collapsed": false, + "pycharm": { + "name": "#%% md\n" + } + }, + "outputs": [ + { + "data": { + "text/plain": "
", + "image/png": "iVBORw0KGgoAAAANSUhEUgAAAz8AAAG6CAYAAAAmt/gBAAAABHNCSVQICAgIfAhkiAAAAAlwSFlzAAALEgAACxIB0t1+/AAAADh0RVh0U29mdHdhcmUAbWF0cGxvdGxpYiB2ZXJzaW9uMy4yLjEsIGh0dHA6Ly9tYXRwbG90bGliLm9yZy+j8jraAAAgAElEQVR4nO3debwddX3/8dfbhEUFQoCoyKqCLRgr1au2CnXB3Vq0WgVpUZtWrYr+fnbDRgtYsS7tzyrSWisKbsGltUWtxbYgGhckKAqI1IhQFpdAAgiIBPz8/pjvJSeHuyW5NyfJvJ6Px3ncc2a+M/OdOTPnnvd8vzMnVYUkSZIkbevuMeoKSJIkSdLmYPiRJEmS1AuGH0mSJEm9YPiRJEmS1AuGH0mSJEm9YPiRJEmS1AuGH6mnknwhyR/M0bz/Isn75mLeWiedDyRZk+Tro67PpkpyWpI3teeHJblsYNwVSZ40x8u/OckD53IZW5vh92FrluToJJ+fYvzjk1y9Oes0sOxJP4+T7Nv2zXmbu17StsjwI23h2pe+n7V/fuOPd4+6XuMm+sJQVW+uqjkJVlrPocCTgb2r6lHDI5O8OMmdA/vN5Un+aKYz35iAnOTBST6R5LokNyb5dpLXbugXt6r6UlX90oZMs4H1vNu6VdVOVXX5XC1zS5OkkhwwVZmNfR+SnJBk7dDn1g0D45Pk1UkuTnJLkqvbfvPQNv60JLcPTf+tDV/L9dblI1X1lIE6TLv+06zj3yT5XpKfJvlukmOGxldbt/H6b9QJoar637Zv3rmxdZW0juFH2jo8q/3zG3+8atQV2pokmT/qOoyb5brsB1xRVbdMUear4/sN8FzgbUl+dRbrcJckDwLOA64CHlpVC4DfAcaAnedimZPUY4t5v7dms7AdPzb0ubXrwLh3Aq8BXg3sBjwY+FfgmQNl3jY0/cM2sT6z7RbgWcAC4EXAO5M8ZqjMwwbq7wkhaQtg+JG2Ukl2SHJDksUDwxa1VqL7JFmY5DNJVrVuUZ9Jsvck8zohyYcHXu/fzlrOb69fkuTSdobz8iQva8PvDXwOuP/A2c37D85vYF4vSvK/rUVg6cCy7pnk9FbHS5P82WRdT5KcmOTk9ny7dlb17QPzuS3JbgPLXJLkf4GzW5lPJPlRa5H4YpKHDMz7tCTvSfKfbT3PTbLfwPhqZ6ovb+vw9iT3GBj/+63+a5KcNcG0r0zyPeB77az3O5L8JMlNSS4afB+H1vn+Sc5MsjrJyiR/2IYvAd4H/Hrb7idONP2gqvomcClw0MD8fy3JV9q+9K0kj2/DTwIOA96dgdbGJO9MclWr9wVJDhtYxInAV6rqtVX1w7bMy6rqhVV1w3TvwdB6T9QF6ZFJvtO28QeS7DhYNsmfJ/kR8IGp9v8p1u2uloAkC5J8sE1/ZZLXj7/f6VrUlqc7878myQ+SPH2y7Z7koHQtTTckuSTJbw2MOy3JKUk+2/a789KFyInmM75fv6S9B2uSvDzJI9O1sN2QoVbhyfbLJF9sRb7VtsELJtmO670PSfZJ8i9tu1w/vLyZSHIg8ErgqKo6u6p+XlW3tpaZt2zE/M5N8tz2/LFtGz2zvT48yYXt+YuTLJ9s/Qfm98ft2PxhkpdMttyqOr6qvltVv6iq84AvAb++ofUf8KAkX2/H1r8l2a3VZ/jz+AtJ/irJl9s+8/kke7RxOyb5cHtvbkhyfpL7bkKdpG2O4UfaSlXVz4F/AY4aGPx84Nyq+gnd8f0ButaBfYGfARvbXe4nwG8CuwAvAd6R5OGtxeHpwLUDZzevnWQehwK/BBwO/GWS8S/gxwP7Aw+k68L1u1PU41zg8e35I4EfAb/RXv86cFlVrR4o/zi6L/pPba8/BxwI3Af4BvCRofkfDfwVsAdw4QTjn0PXivFw4Ajg9wGSHAH8BfDbwCK6L0HLhqZ9NvBo4GDgKa3eD6Y7a/x84PpJ1vkM4Grg/sDzgDcneWJVnQq8nHUtO8dPMv1dkjyyLXNFe70X8FngTXRn3/8E+Ocki6pqaVuPVw21Np4PHNLKfxT4xHgIAZ4EfHKaakz3HkzlaLr38kFtPV4/MO5+rU77AS9liv1/inUbdDLde/NAuv3oGLp9f9yjgcvo9pW3AacmyfBMkmwHfBr4fFvnY4GPJBnsSnYkXXBcCKwETppmOzyabhu+APg7YCndtn8I8Pwkj2vLnnS/rKrx42a8ZeJj7fXwdhxcl3nAZ4Ar6Y7Zvej2zw11OHB1Vc3WdWqDnwuPAy5n3efC49r49Uyz/gvo1m0JcEqShdNVIMk96T6TLhka9cV0Yf9fkuw/zWyOoftM2RO4A3jXFGVfSLc/3gfYnu7Yha4FagGwD7A73WfEz6arv9QrVeXDh48t+AFcAdwM3DDw+MM27knA9wfKfhk4ZpL5HAKsGXj9BeAP2vMTgA8PjNsfKGD+JPP6V+A17fnj6b7IDI6/a34D89p7YPzXgSPb88uBpw6M+4Ph+Q2MuydwG90/9ePovthdDexE9+XxXUPLfOAU23XXVmZBe30acMbA+J2AO4F92usCnjYw/hXAf7fnnwOWDIy7B3ArsN/AtE8cGP9E4H+AXwPuMUUd92l12Hlg2F8Dp7XnLwaWTzH9i+m+RN0A/LTV42QgbfyfAx8amuYs4EXD+8gUy1hD9wUSYO3gNprBvj3Re/CmifYruuPg5QOvn0Hb91vZ24Edp1jWpPv/wLACDgDmtfkdPDDuZcAXBrbryoFx92rT3m+C5R5GF9LvMTBsGXDCwDq/b2i9vjvJOuzflrPXwLDrgRcMvP5n4P9swH55wMD4u23HwfeB7gTDKib5XBiq6wltXoOfW+e0cUuBr00z/Wl0x/rg9KdPUvZw4Nvt+X/QfYZ8rb0+F/jtiY6XSdb/Z4PrR3fi59dmsL6nt2VnYNhv0AWTXemC98WTbbu2P75l4PXBbfvNY+jzuJV9/UDZVwD/0Z7/PvAV4Fdmehz68NG3hy0/0tbh2VW168Djn9rwc4B7JXl0O6t4CPApgCT3SvKPrcvOTcAXgV2zEXcMSvL0JF9L1/XqBrovaHts4Gx+NPD8VrpwAV2LxlUD4wafr6eqfkbXavE4ui8W59L9o38sE5/hvWteSeYleUuS77ftcUUbtcdE5avqZmB1q99EdbtyYNx+dP39b2jbZzUQurPHE837bLovQ6cAP0ny3iS7TLDK9wdWV9VPh5a71wRlJ/O1ts/sTHdW+yHAmwfq/Tvj9W51P5TuzPOEkvxJ60Z1Yyu/gHXb8Ppppp3JezCVybY/wKqqum1gWZuy/+8BbNeWMbi8we1+1/5cVbe2pztxd/cHrqqqX8xkXqx/bEzmxwPPfzbB6/HpZ7JfDltvOw7ZB7iyqu6Ypn7jPj70ufWENnzK/WTA3wxN/6JJyn0VeHDr3nUI8EFgn9YV7FF07/1MXT+0ftO+H+m63i4Gnl9VNT68qr5YVbdX1+XzNcADGOhyOoHh/Xs7Jj82JttnPkR3AuOMJNcmeVtrfZTUGH6k
rVh1d//5OF3Xt6OAzwx8Uf5jum5mj66qXVjXDeRuXXPoLty918Dr+40/SbID3dnkvwHuW91Fy/8+MJ9i0/wQGLwWaZ9pyp9L13Lyq3RdsM6l6wo10Zecwbq9kK6r2pPovrDv34YPbo+7lp1kJ7ruP9dONJ6uK9X4uKuAlw19UbtnVX1lkrpQVe+qqkfQneF9MPCnE6zrtcBuSQZvFrAvcM0EZadVVT+mey+fNVDvDw3V+9617rqL9eqc7vqeP6Prprew7Qs3sm4b/hfdTRUmM5P3YCqTbf+71ZXp9/+p9tvr6Fqx9hsYtrHb/Vq6L+KD/283+j3cQDPZL4dNtV2uAvbNpt8I4b+BvZOMbeJ8gLvC5wV0AePiqrqd7qTIa+laB6+bjeVMJN21dk8HnlJVN01XVabe14f377V0++KMVdXaqjqxqg4GHkPXXfmYaSaTesXwI239PkrX9//o9nzcznRngW9oF85OdU3IhcBvpPs9iQXA6wbGbQ/sQNfd5Y50F3Y/ZWD8j4Hd23Qb4+PA69JdoL4XMN2d7M6l+2f+nfYl5wt03Vx+UFWrpphuZ+DndGed78W61o9Bz0hyaJLt6a79+VpVDZ6N/dNWz33ovmiNXyfwnrYOD4G7Lpb/nckqku4C9Ue3M7K30HXv+cVwubbsrwB/3S5k/hW66xA+PFx2JpLsTnfd0vh1CR8GnpXkqa1VZsd0F7iPh9Ef013zMm5num50q4D5Sf6S7jqwcccDj0l3M4j7tWUe0C7A3pWZvQdTeWWSvdv+vJR1238i0+3/w+t2l4GTCicl2TndTQJey8Zt9/Pozsz/WbqbdDyeLnxuzLUyG2q6/XLSbTCJr9OdrHhLknu3/eWxG1qpqvoe8PfAsra/bd/mdWSS4zZ0fs25dJ8d462/Xxh6PZENXf/1JHkdXaB/UlVdPzTuIUkOacfVTsDf0gXeS6eY5e8mOTjJvYA3Ap+sDby9dZInJHloa+G8iS5A3e2zReozw4+0dfh01v+9i0+Nj6juLkO30HWv+dzANH9Hd43MdcDX6PqjT6iq/pPui+S36c6gfmZg3E/pbkf7cbrrO14InDkw/rt01zBc3rrXDHZFmok30l238wO6loNP0n1BnsxX2nqNt/J8hy48TNe15YN0XUmuadN8bYIyH6X7krwaeAR3v/nCv9FtnwvpbhRwKkBVfQp4K11Xk5vo+vZPevcvusDwT3Tb80q6MPD2ScoeRddCci1dl8bjq+q/ppj3sPG7wd1M98VrFd1F9+Phavyi+FV0Z/b/lHX/G94JPC/dncLeRded5j/orle6km67D3bn+z7ddSH7A5ckuZGupWkF3TVHM3kPpvJRuhsHXA58n+5GDZOZbv8fXrdhx9IdV5cDy9uy37+B9aUF9GfR7Q/X0X3pP6YdN3NqBvvlCcDp7bh9/gzmdyfduhwA/C/dcfuCKSZ5wdDn1s1J7tPGvZp1XT9voHs/n0N3c4hxfzY07VStIOfSBd4vTvJ6IiewAes/gTfTtdCsHKjjX7Rx96X7TL2Jbh/aH/jNqlo7xfw+RHet04+AHem20Ya6H91n6E10x/u5bb6SmvGLXiVpi5DuRziPrKrHbeblnkZ3YffrJxlfwIFVtXJz1kuSJM0eW34kjVSSPdP9Nsc90t3+949pN22QJEmaTf4KtqRR2x74R7o7Id1Ady3E34+0RpIkaZtktzdJkiRJvWC3N0mSJEm9YPiRpBFK8p4kb5hi/AlJNurW1jNYdpJ8oN3x7OszKF9JDphk3IuTLJ/9Wm7b2q2erx51PaaT5Ogknx91PSRpUxl+JIm7gsCrk1yc5JYkVyf5RPvNjOOS3O2WuUn2SHJ7ksUbu9yqenlV/VWb36x8EU7ymiQ/aOtxaZIHT1L0UODJwN5V9ahNXe6oJTknyaokNyX5VpIjhsa/MMmVbbv8a/v9n/FxuyX5VBt3ZZIXzta0s7h+IwuYVfWRqnrK9CUNwpK2bIYfSeq8k+6HS18N7AY8GPhX4Jl0P275mCQPGJrmSOCiqrp4c1Z0Kkn+gO6HUJ8J7ET3C++T/T7KfsAVVXXLZqreXHsNsGdV7QK8FPhwkj2h+9FJuhtr/B7db7Dcyvo31jgFuL2NOxr4h6z7cdCNnlaStGUx/EjqvSQHAq8Ejqqqs6vq51V1azvb/Zaquho4m+7L76Bj6H64c3h+Oyb5WZI92uulSe5Iskt7/VdJ/q49Py3Jm5Lcm+5Hau8/8IOJ4z8Yu32SDyb5aZJLkoxNsh73oPuR1v9bVd+pzveravUEZZcA72Pdj6Ce2Ib/YZKVSVYnOTOT/Ghtkt3b+Jtal7kHDYxLknck+Ukbf9FkrWNJ7t/ms7ot9w8Hxp2Q5OMzWXeAqvp2Vd0x/hLYDtinvT4a+HRVfbGqbgbeAPx2kp3btn8u8IaqurmqltP9kO/vzcK0w+t7z/aer0nyHeCRQ+OPS/L9tr7fSfKcNvwg4D2se79uaMOfmeSbbTtfleSEybbPeMtikr9Icl2SK5IcPTB+QdvWq1oL1uvbPnW31px0XSBfnuR76X4k9JT2vk9Wz2e09flpkmuS/Mlk9ZSkuWT4kSQ4nO4HTqe67uV0Br7QpvtNokOAjw4XrKrbgPOB8R9qfRxwJfDYgdfnDk1zC/B04Nqq2qk9rm2jf4vuFuC70n2xfvckddy7PRa3L8I/SHLi+BfYoeWdCrwc+Gpb1vFJngj8NfB8YM9W5zMmWdYpwG2t3O+3x7inAL9B13q2oM3v+knmcwZwNXB/4HnAm1s9xs103QFI8pkktwHnAV8AVrRRDwG+NbD+36drrXlwe9xRVf8zMKtvtWk2ddphx9MFxQcBTwVeNDT++8BhdNvtRFrrVVVdyvrv166t/C10IXxXuta+P0ry7EmWDXA/YA9gr7bs97Z9GeDkttwH0u2jxwAvmWJev0kX3n6F7j1+6hT1PBV4WVXtDCymO5kgSZud4UeSYHfgh9OU+RRw3ySPaa+PAT5XVasmKX8u8Lgk8+m+HL6rvd6R7gvj3a4hmsLyqvr3qroT+BDwsEnK7d3+PgV4KPAE4Ci6bnAzcTTw/qr6RlX9HHgd3Rn8/QcLJZlH19rxl1V1S+v2d/pAkbXAzsAv0/2kwqVVdbftm2QfukD451V1W1VdSNcadcxGrDsAVfWbbdnPAD5fVb9oo3YCbhwqfmMruxNw0yTjNnXaYc8HTqqq1VV1Fd1+MVj/T1TVtVX1i6r6GPA9YNLrsarqC1V1USv/bWAZ60L3ZN7QWjfPBT4LPL+9p0cCr6uqn1bVFcDfMkkLVvOWqrqhqv4XOIfuZMBk1gIHJ9mlqtZU1TemqaMkzQnDjyR1rRJ7TlWgqm4FPgEckyR0QeFuXd4GnAs8Hng4cBHwn3RfSn8NWFlVk7WETORHA89vBXZsoWrYz9rft7UvpVfQXavyjBku5/50rT0AtC5e19O1EgxaRPcj2VcNDBuc7my6FppTgJ8kee94l78Jlre6qn46NJ/B5c103e9SVWur6nPAU5L8Vht8MzBch12An04zblOnHXZ/JtluAEmOSXJ
h60p2A10ryR6TzIskj866Gz3cSNfqMml5YM3QNV5XtjrtQddN8MqhccPv/aDh92anKco+l24/vDLJuUl+fYqykjRnDD+SBP8N7D3V9STN6XRn7p9Md2b/01OU/QrwS8BzgHOr6jvAvnRfAM+dZJpN/dXpy+i6Yw3OZ0PmeS3dTRAAaNez7A5cM1RuFXAH666ngW7d1i206l1V9QjgYLquYX86yfJ2SzLYSrLvBMvbWPNZdy3SJQy0GiV5ILAD8D/tMT/dtV/jHtam2dRph/2QSbZbkv2AfwJeBezeuoxdDKQVmei9/Chdd8B9qmoB3fU2maDcuIXtfR1c/rV0N8VYy8D7z8a/F3erZ1WdX1VHAPehu5HIxzdivpK0yQw/knqvqr5Hd/euZe2i8O3T3bTgyCTHDRT9EnAD8F7gjKq6fYp53gpcQHcjhfGw8xW6M/OThZ8fA7snWbCR63Er8DHgz9rF+HvT3fXsMzOcxTLgJUkOSbID8GbgvNaCNLicO4F/AU5Icq8kBzNw7UqSR7YWie3orkm5DfgFQ1q3r68Af92296/QddHb4N81SvLLSZ7ebiiwXZLfpbvuaHxbfwR4VpLD2pf/NwL/0rp43dLW541J7p3kscARdN3sNnXaYR8HXpdkYXt/jh0Yd2+64LCqrdNL6Fp+xv2YLqRvPzBsZ7rWs9uSPAqYyW22T2z7+GF01+18or2nHwdOavvOfsBr2Yj3YriebVlHJ1lQVWvpugnebX+QpM3B8CNJnVezrqvWDXQXnj+Hgdadqiq6rm77MXWXt3Hn0nUl+vrA652Z5HqfqvouXQC5vHV7mvBOa9N4FV1XrGuBr9K1DLx/JhNW1X/R3cnsn+laKB5Edx3IZMvZia7r02nABwbG7ULXgrGGruvU9cDbJ5nPUcD+rb6fAo5v9dhQAU4AfkIXHl4DvGD82pKquoQueH6kldkZeMXA9K8A7tnGLQP+qE2zSdNO4ES6bfID4PMMhKTWOvi3dO/bj+mu2/rywLRn07Uo/SjJ+O3LX0EXvH4K/CXTt6j8iO59ubatz8vbfgddELsFuBxYzgbsO0MmqufvAVckuYluWx4NkGTfdle4fSeelSTNrnT/yyVJ0rYsyeOBD1fV3tOVlaRtlS0/kiRJknrB8CNJkiSpF+z2JkmSJKkXbPmRJEmS1AtT/lDclmaPPfao/ffff9TVkCRJkrSFuuCCC66rqkUTjduqws/+++/PihUrRl0NSZIkSVuoJFdONs5ub5IkSZJ6wfAjSZIkqRcMP5IkSZJ6wfAjSZIkqRcMP5IkSZJ6wfAjSZIkqRcMP5IkSZJ6wfAjSZIkqRcMP5IkSZJ6wfAjSZIkqRcMP5IkSZJ6wfAjSZIkqRcMP5IkSZJ6wfCjGVm2bBmLFy9m3rx5LF68mGXLlo26SpIkSdIGmT/qCmjLt2zZMpYuXcqpp57KoYceyvLly1myZAkARx111IhrJ0mSJM1MqmrUdZixsbGxWrFixair0TuLFy/m5JNP5glPeMJdw8455xyOPfZYLr744hHWTJIkSVpfkguqamzCcYYfTWfevHncdtttbLfddncNW7t2LTvuuCN33nnnCGsmSZIkrW+q8OM1P5rWQQcdxPLly9cbtnz5cg466KAR1UiSJEnacIYfTWvp0qUsWbKEc845h7Vr13LOOeewZMkSli5dOuqqSZIkSTPmDQ80rfGbGhx77LFceumlHHTQQZx00kne7ECSJElbFa/5kSRJkrTN8JofSZIkSb1n+JEkSZLUC4YfSZIkSb1g+JEkSZLUC4YfSZIkSb1g+JEkSZLUC4YfSZIkSb1g+JEkSZLUC4YfSZIkSb1g+JEkSZLUC4YfSZIkSb1g+JEkSZLUC4YfSZIkSb1g+JEkSZLUCzMKP0meluSyJCuTHDfB+B2SfKyNPy/J/kPj901yc5I/mek8JUmSJGk2TRt+kswDTgGeDhwMHJXk4KFiS4A1VXUA8A7grUPj/x/wuQ2cpyRJkiTNmpm0/DwKWFlVl1fV7cAZwBFDZY4ATm/PPwkcniQASZ4N/AC4ZAPnKUmSJEmzZibhZy/gqoHXV7dhE5apqjuAG4Hdk+wE/Dlw4kbME4AkL02yIsmKVatWzaC6kiRJknR3c33DgxOAd1TVzRs7g6p6b1WNVdXYokWLZq9mkiRJknpl/gzKXAPsM/B67zZsojJXJ5kPLACuBx4NPC/J24BdgV8kuQ24YAbzlCRJkqRZM5Pwcz5wYJIH0AWUI4EXDpU5E3gR8FXgecDZVVXAYeMFkpwA3FxV724Babp5SpIkSdKsmTb8VNUdSV4FnAXMA95fVZckeSOwoqrOBE4FPpRkJbCaLsxs8Dw3cV0kSZIkaVLpGmi2DmNjY7VixYpRV0OSJEnSFirJBVU1NtG4ub7hgSRJkiRtEQw/kiRJknrB8CNJkiSpFww/kiRJknrB8CNJkiSpFww/kiRJknrB8CNJkiSpFww/kiRJknrB8CNJkiSpFww/kiRJknrB8CNJkiSpFww/kiRJknrB8CNJkiSpFww/kiRJknrB8CNJkiSpFww/kiRJknrB8CNJkiSpFww/kiRJknrB8CNJkiSpFww/kiRJknrB8CNJkiSpFww/kiRJknrB8CNJkiSpFww/kiRJknrB8CNJkiSpFww/kiRJknrB8CNJkiSpFww/kiRJknrB8CNJkiSpFww/kiRJknrB8CNJkiSpFww/kiRJknrB8CNJkiSpFww/kiRJknrB8CNJkiSpFww/kiRJknrB8CNJkiSpF+aPugKSNNeSjLoKG6yqRl0FSZK2OYYfSdu8uQoSSQwpkiRtRez2JkmSJKkXDD+SJEmSesHwI0mSJKkXDD+SJEmSesHwI0mSJKkXvNvbNspb+0qSJEnrM/xso7y1ryRJkrQ+u71JkiRJ6gVbfkZst912Y82aNaOuxgbZmrrULVy4kNWrV4+6GpIkSdoCGH5GbM2aNXYjm0NbU1CTJEnS3JpRt7ckT0tyWZKVSY6bYPwOST7Wxp+XZP82/FFJLmyPbyV5zsA0VyS5qI1bMVsrJEmSJEkTmbblJ8k84BTgycDVwPlJzqyq7wwUWwKsqaoDkhwJvBV4AXAxMFZVdyTZE/hWkk9X1R1tuidU1XWzuUKSJEmSNJGZtPw8ClhZVZdX1e3AGcARQ2WOAE5vzz8JHJ4kVXXrQNDZEbB/lyRJkqSRmEn42Qu4auD11W3YhGVa2LkR2B0gyaOTXAJcBLx8IAwV8PkkFyR56cavgiRJkiRNb85veFBV5wEPSXIQcHqSz1XVbcChVXVNkvsA/5nku1X1xeHpWzB6KcC+++4719WVJEmStI2aScvPNcA+A6/3bsMmLJNkPrAAuH6wQFVdCtwMLG6vr2l/fwJ8iq573d1U1XuraqyqxhYtWjSD6kqSJEnS3c0k/JwPHJjkAUm2B44Ezhwqcybwovb8ecDZVVVtmvkASfYDfhm4Ism9k+zcht8beArdzREkSZIkaU5M2+2t3antVcBZwDzg/VV1SZI3Aiuq6kzgVOBDSVYCq+kCEsChwHFJ1g
K/AF5RVdcleSDwqfYbLPOBj1bVf8z2ykmSJEnSuGxNP7A5NjZWK1ZsWz8JlMQfOZ1Dbl/NJfcvSZK2PEkuqKqxicbN6EdOJUmSJGlrZ/iRJEmS1AuGH0mSJEm9YPiRJEmS1AuGH0mSJEm9MO2trjW36vhd4IQFo67GNquO32XUVZAkSdIWwvAzYjnxJm+VO4eSUCeMuhaSJEnaEtjtTZIkSVIvGH4kSZIk9YLhR5IkSVIvGH4kSZIk9YLhR5IkSVIvGH4kSZIk9YLhR5IkSVIvGH4kSZIk9YLhR5IkSVIvGH4kSZIk9YLhR5IkSVIvGH4kSZIk9YLhR5IkSVIvGH4kSZIk9YLhR5IkSVIvzB91BSRp3G677caaNWtGXY0NkmTUVZixhQsXsnr16lFXQ5KkkTH8SNpirFmzhqoadTW2WVtTUJMkaS7Y7U2SJElSLxh+JEmSJPWC4UeSJElSLxh+JC70Jn4AABZTSURBVEmSJPWC4UeSJElSLxh+JEmSJPWC4UeSJElSLxh+JEmSJPWC4UeSJElSL8wfdQUkaVwdvwucsGDU1dhm1fG7jLoKkiSNlOFH0hYjJ95EVY26GtusJNQJo66FJEmjY7c3SZIkSb1gy88WIMmoq7DNWrhw4airIEmSpC2E4WfEtrYuPkm2ujpLkiRJYLc3SZIkST1h+JEkSZLUC4YfSZIkSb1g+JEkSZLUC4YfSZIkSb1g+JEkSZLUC4YfSZIkSb1g+JEkSZLUC4YfSZIkSb1g+JEkSZLUC4YfSZIkSb1g+JEkSZLUCzMKP0meluSyJCuTHDfB+B2SfKyNPy/J/m34o5Jc2B7fSvKcmc5TkiRJkmbTtOEnyTzgFODpwMHAUUkOHiq2BFhTVQcA7wDe2oZfDIxV1SHA04B/TDJ/hvOUJEmSpFkzk5afRwErq+ryqrodOAM4YqjMEcDp7fkngcOTpKpurao72vAdgdqAeUqSJEnSrJlJ+NkLuGrg9dVt2IRlWti5EdgdIMmjk1wCXAS8vI2fyTxp0780yYokK1atWjWD6kqSJEnS3c35DQ+q6ryqegjwSOB1SXbcwOnfW1VjVTW2aNGiuamkJEmSpG3eTMLPNcA+A6/3bsMmLJNkPrAAuH6wQFVdCtwMLJ7hPCVJkiRp1swk/JwPHJjkAUm2B44Ezhwqcybwovb8ecDZVVVtmvkASfYDfhm4YobzlCRJkqRZM3+6AlV1R5JXAWcB84D3V9UlSd4IrKiqM4FTgQ8lWQmspgszAIcCxyVZC/wCeEVVXQcw0Txned0kSZIk6S6pqulLbSHGxsZqxYoVo65GryVha9pntHVx/5pbbl9JUh8kuaCqxiYaN+c3PJAkSZKkLYHhR5IkSVIvGH4kSZIk9YLhR5IkSVIvGH4kSZIk9YLhR5IkSVIvGH4kSZIk9YLhR5IkSVIvGH4kSZIk9YLhR5IkSVIvGH4kSZIk9YLhR5IkSVIvzB91BSRpUJJRV2GbtXDhwlFXQZKkkTL8SNpiVNWoq7BBkmx1dZYkqc/s9iZJkiSpFww/kiRJknrB8CNJkiSpFww/kiRJknrB8CNJkiSpFww/kiRJknrB8CNJkiSpFww/kiRJknrB8CNJkiSpFww/kiRJknrB8CNJkiSpFww/kiRJknph/qgrIElzLclWN++qmpP5SpLUZ4YfSds8g4QkSQK7vUmSJEnqCcOPJEmSpF4w/EiSJEnqBcOPJEmSpF7whgfbKO9uJUmSJK3P8LONMkhIkiRJ67PbmyRJkqReMPxIkiRJ6gXDjyRJkqReMPxIkiRJ6gXDjyRJkqReMPxIkiRJ6gXDjyRJkqReMPxIkiRJ6gXDjyRJkqReMPxIkiRJ6gXDjyRJkqReMPxIkiRJ6gXDjyRJkqReMPxIkiRJ6gXDjyRJkqReMPxIkiRJ6oUZhZ8kT0tyWZKVSY6bYPwOST7Wxp+XZP82/MlJLkhyUfv7xIFpvtDmeWF73Ge2VkqSJEmShs2frkCSecApwJOBq4Hzk5xZVd8ZKLYEWFNVByQ5Engr8ALgOuBZVXVtksXAWcBeA9MdXVUrZmldJEmSJGlSM2n5eRSwsqour6rbgTOAI4bKHAGc3p5/Ejg8Sarqm1V1bRt+CXDPJDvMRsUlSZIkaUPMJPzsBVw18Ppq1m+9Wa9MVd0B3AjsPlTmucA3qurnA8M+0Lq8vSFJJlp4kpcmWZFkxapVq2ZQXUmSJEm6u81yw4MkD6HrCveygcFHV9VDgcPa4/cmmraq3ltVY1U1tmjRormvrCRJkqRt0kzCzzXAPgOv927DJiyTZD6wALi+vd4b+BRwTFV9f3yCqrqm/f0p8FG67nWSJEmSNCdmEn7OBw5M8oAk2wNHAmcOlTkTeFF7/jzg7KqqJLsCnwWOq6ovjxdOMj/JHu35dsBvAhdv2qpIkiRJ0uSmDT/tGp5X0d2p7VLg41V1SZI3JvmtVuxUYPckK4HXAuO3w34VcADwl0O3tN4BOCvJt4EL6VqO/mk2V0ySJEmSBqWqRl2HGRsbG6sVK7wztiRJkqSJJbmgqsYmGrdZbnggSZIkSaNm+JEkSZLUC4YfSZIkSb1g+JEkSZLUC4YfSZIkSb1g+JEkSZLUC4YfSZIkSb1g+JEkSZLUC4YfSZIkSb1g+JEkSZLUC4YfSZIkSb1g+JEkSZLUC4YfSZIkSb1g+JEkSZLUC4YfSZIkSb1g+JEkSZLUC4YfSZIkSb1g+JEkSZLUC4YfSZIkSb1g+JEkSZLUC4YfSZIkSb1g+JEkSZLUC4YfSZIkSb1g+JEkSZLUC4YfSZIkSb1g+JEkSZLUC4YfSZK0UZYtW8bixYuZN28eixcvZtmyZaOukiRNaf6oKyBJkrY+y5YtY+nSpZx66qkceuihLF++nCVLlgBw1FFHjbh2kjSxVNWo6zBjY2NjtWLFilFXQ5Kk3lu8eDEnn3wyT3jCE+4ads4553Dsscdy8cUXj7BmkvouyQVVNTbhOMOPJEnaUPPmzeO2225ju+22u2vY2rVr2XHHHbnzzjtHWDNJfTdV+PGaH0mStMEOOuggli9fvt6w5cuXc9BBB42oRpI0PcOPJEnaYEuXLmXJkiWcc845rF27lnPOOYclS5awdOnSUVdNkiblDQ8kSdIGG7+pwbHHHsull17KQQcdxEknneTNDiRt0bzmR5IkSdI2w2t+JEmSJPWe4UeSJElSLxh+JEmSJPWC4UeSJElSLxh+JEmSJPWC4UeSJElSLxh+JEmSJPWC4UeSJElSLxh+JEmSJPWC4UeSJElSLxh+JEmSJPWC4UeSJElSLxh+JEmSJPWC4UeSJElSLxh+JEmSJPWC4UeSJElSL8wo/CR5WpLLkqxMctwE43dI8rE2/rwk+7fhT05yQZKL2t8nDkzziDZ8ZZJ3JclsrZQkSZIkDZs2/CSZB5wCPB04GDgqycFDxZYAa6rqAOAdwFvb8OuAZ1XVQ4EXAR8amOYfgD8EDmyPp23CekiSJEnSlGbS8vMoYGVVXV5VtwNnAEcMlTkCOL09/yRweJJU1Ter6to2/BLgnq2VaE9gl
6r6WlUV8EHg2Zu8NpIkSZI0iZmEn72AqwZeX92GTVimqu4AbgR2HyrzXOAbVfXzVv7qaeYJQJKXJlmRZMWqVatmUF1JkiRJurvNcsODJA+h6wr3sg2dtqreW1VjVTW2aNGi2a+cJEmSpF6YSfi5Bthn4PXebdiEZZLMBxYA17fXewOfAo6pqu8PlN97mnlKkiRJ0qyZSfg5HzgwyQOSbA8cCZw5VOZMuhsaADwPOLuqKsmuwGeB46rqy+OFq+qHwE1Jfq3d5e0Y4N82cV0kSZIkaVLThp92Dc+rgLOAS4GPV9UlSd6Y5LdasVOB3ZOsBF4LjN8O+1XAAcBfJrmwPe7Txr0CeB+wEvg+8LnZWilJkiRJGpbuZmtbh7GxsVqxYsWoqyFJkiRpC5Xkgqoam2jcZrnhgSRJkiSNmuFHkiRJUi8YfiRJkiT1guFHkiRJUi8YfiRJkiT1guFHkiRJUi8YfiRJkiT1guFHkiRJUi8YfiRJkiT1guFHkiRJUi8YfiRJkiT1guFHkiRJUi8YfiRJkiT1guFHkiRJUi8YfiRJkiT1guFHkiRJUi8YfiRJkiT1guFHkiRJUi8YfiRJkiT1guFHkiRJUi8YfiRJkiT1guFHkiRJUi8YfiRJkiT1wvxRV0CSJG0eSUZdhQ1WVaOugqRtiOFHkqSemKsgkcSQImmrYLc3SZIkSb1g+JEkSZLUC4YfSZIkSb1g+JEkSZLUC4YfSZIkSb1g+JEkSZLUC97qWpKkLcxuu+3GmjVrRl2NDbI1/YbQwoULWb169airIWkEDD+SJG1h1qxZ4+/mzKGtKahJml12e5MkSZLUC4YfSZIkSb1g+JEkSZLUC4YfSZIkSb1g+JEkSZLUC4YfSZIkSb1g+JEkSZLUC4YfSZIkSb1g+JEkSZLUC4YfSZIkSb1g+JEkSZLUC/NHXQFJkiRpc0oy6ipssKoadRW2CYYfSZIk9cpcBYkkhpQtnN3eJEmSJPWCLT+SJEnaIu22226sWbNm1NXYIFtTl7qFCxeyevXqUVdjszL8SJIkaYu0Zs0au5HNoa0pqM2WGXV7S/K0JJclWZnkuAnG75DkY238eUn2b8N3T3JOkpuTvHtomi+0eV7YHveZjRWSJEmSpIlM2/KTZB5wCvBk4Grg/CRnVtV3BootAdZU1QFJjgTeCrwAuA14A7C4PYYdXVUrNnEdJEmSJGlaM2n5eRSwsqour6rbgTOAI4bKHAGc3p5/Ejg8SarqlqpaTheCJEmSJGlkZhJ+9gKuGnh9dRs2YZmqugO4Edh9BvP+QOvy9oZM0ukwyUuTrEiyYtWqVTOYpSRJkiTd3ShvdX10VT0UOKw9fm+iQlX13qoaq6qxRYsWbdYKSpIkSdp2zCT8XAPsM/B67zZswjJJ5gMLgOunmmlVXdP+/hT4KF33OkmSJEmaEzMJP+cDByZ5QJLtgSOBM4fKnAm8qD1/HnB2TXFfwiTzk+zRnm8H/CZw8YZWXpIkSZJmatq7vVXVHUleBZwFzAPeX1WXJHkjsKKqzgROBT6UZCWwmi4gAZDkCmAXYPskzwaeAlwJnNWCzzzgv4B/mtU1kyRpK1XH7wInLBh1NbZZdfwuo66CpBHJ1vTDUWNjY7VihXfGliRt25L4w45zyO279fC9mlvb6vZNckFVjU00bpQ3PJAkSZKkzWbabm+SJGnzm+QXIDQLFi5cOOoqSBoRw48kSVuYra0byrbadUbStsfwI0mSpC2SN/+YW328+YfhR5IkSVuknHiTrYpzKAl1wqhrsXl5wwNJkiRJvWD4kSRJktQLhh9JkiRJvWD4kSRJktQLhh9JkiRJveDd3iRJkrTF8gd/504ff/DX8CNJkqQt0tZ2m2t/8HfLZ7c3SZIkSb1g+JEkSZLUC4YfSZIkSb1g+JEkSZLUC4YfSZIkSb1g+JEkSZLUC4YfSZIkSb1g+JEkSZLUC4YfSZIkSb1g+JEkSZLUC4YfSZIkSb0wf9QVkCRJm0eSrW7eVTUn85XUT4YfSZJ6wiAhqe8MP5IkSeoVW0H7y/AjSZKkXjFI9Jc3PJAkSZLUC4YfSZIkSb1g+JEkSZLUC4YfSZIkSb1g+JEkSZLUC4YfSZIkSb1g+JEkSZLUC4YfSZIkSb1g+JEkSZLUC4YfSZIkSb1g+JEkSZLUC4YfSZIkSb1g+JEkSZLUC4YfSZIkSb1g+JEkSZLUC6mqUddhxpKsAq4cdT16bg/gulFXQtpCeDxIHY8FqeOxsGXYr6oWTTRiqwo/Gr0kK6pqbNT1kLYEHg9Sx2NB6ngsbPns9iZJkiSpFww/kiRJknrB8KMN9d5RV0Dagng8SB2PBanjsbCF85ofSZIkSb1gy48kSZKkXjD8SJIkSeoFw882LsmdSS5M8q0k30jymFme/2lJnteevy/JwbM5f2ljJblfkjOSfD/JBUn+PcmDk1w8wjr9e5JdR7V8Cdb7v3Bxkk9v6D6Z5Oa5qlub/13/V9RPU3x+79/Xz3CPu9kzf9QV0Jz7WVUdApDkqcBfA4+biwVV1R/MxXylDZUkwKeA06vqyDbsYcB9R1mvqnrGKJcvNYP/F04HXgmcNNoqSZ1pPr+vGmXdZvszPMn8qrpjNuep6dny0y+7AGsAkuyU5L9ba9BFSY5ow++d5LOtpejiJC9owx+R5Nx2BuasJHsOzzzJF5KMtec3JzmpzedrSe7bhi9K8s9Jzm+Px262tVefPAFYW1XvGR9QVd9i4B9nO4P4pXYM3NUqmmTPJF8cODN+WJJ57azYxe14+b/DC0zyp0le3Z6/I8nZ7fkTk3ykPb8iyR6bcpxJs+yrwF4ASR6U5D/a/velJL/chj8gyVfbvv+m8QmTPD7JZwZevzvJi9vzRyb5StvHv55k53Ycvb199n87ycta2bRpL0vyX8B9NuP6a8sz4ed3VX1psNDW+hne6vGeJOcBb/O42/xs+dn23TPJhcCOwJ7AE9vw24DnVNVNSfYAvpbkTOBpwLVV9UyAJAuSbAecDBxRVavaQX4S8PtTLPfewNeqammStwF/CLwJeCfwjqpanmRf4CzgoNleafXeYuCCacr8BHhyVd2W5EBgGTAGvBA4q6pOSjIPuBdwCLBXVS0GyMTdHr4E/DHwrjafHdqxcxjwxaGys3WcSRut7d+HA6e2Qe8FXl5V30vyaODv6f5nvBP4h6r6YJJXzmC+2wMfA15QVecn2QX4GbAEuLGqHplkB+DLST4P/CrwS8DBdGf3vwO8fzbXVVuVmXx+w9b9Gb438JiqujPJf+Nxt1kZfrZ9g90bfh34YJLFQIA3J/kN4Bd0Z/7uC1wE/G2StwKfqaovtfKLgf9MAjAP+OE0y70dGD8zcQHw5Pb8ScDBbT4AuyTZqarmtC+rNIHtgHcnOQS4E3hwG34+8P72j+xfq+rCJJcDD0xyMvBZ4PMTzO8C4BHtH87PgW/Q/QM9DHj1UNnZOs6kjTF+Umwv4FK6fW4n4DHAJwY+n3dofx8LPLc9/xDw1mnm/0vAD6vqfICqugkgyVOAX8m66woWAAcCvwEsq6o7gWvHz7hL
09iaP8M/0YKPx90IGH56pKq+2lp5FgHPaH8fUVVrk1wB7FhV/5Pk4W38m9oZiU8Bl1TVr2/A4tbWuh+RupN1+9o9gF+rqttmYZWkyVwCTHfh5v8Ffgw8jG6/vA2gqr7YTgo8Ezgtyf9rZ94eBjwVeDnw/CTHA59u83pPVb0nyQ+AFwNfAb5N133jALovmHeZxeNM2hg/q6pDktyLrvX9lcBpwA3jJ8smMNGPAt7B+t3nd5xmuQGOraqz1huYeC2cBs3k8xu27s/wW9rfe+Bxt9l5zU+PtH6k84Dr6ZL/T1rweQKwXytzf+DWqvow8Hbg4cBlwKLWckSS7ZI8ZCOr8Xng2IE6TXbAS5vibLouCy8dH5DkV4B9BsosoDtL9gvg9+iODZLsB/y4qv4JeB/w8HbS4B5V9c/A64GHV9VVVXVIe4z3Tf8S8Cd0XSS+RPdP9psDJwLG6zLXx5k0raq6le6M9h8DtwI/SPI7cNf1AA9rRb8MHNmeHz0wiyvpWvJ3aN2IDm/DLwP2TPLINq+dk8ynC1p/1M7Ik+7uXfemO15ekO7ahD3pvnCqvyb8/E5y2FC5rf4zvLXOeNxtZrb8bPvGuzdAl/5f1JpaPwJ8OslFwArgu63MQ4G3J/kFsBb4o6q6vTWXvivJArr95u/ozs5sqFcDpyT5dpvPF+k+XKRZU1WV5DnA3yX5c7ozglcA/2eg2N8D/5zkGOA/WHcm7vHAnyZZC9wMHEPXPegDScZPGL1ukkV/CVgKfLWqbklyWxs2bK6PM2lGquqb7fP4KLovWP+Q5PV0XYrOAL4FvAb4aDuW/m1g2quSfBy4GPgB8M02/PZ01zucnOSedNcdPInui+j+wDfS9fFZBTyb7oz5E+muOfhfupswqKdm+PkN285nuMfdZpahMCtJkiRJ2yS7vUmSJEnqBcOPJEmSpF4w/EiSJEnqBcOPJEmSpF4w/EiSJEnqBcOPJEmSpF4w/EiSJEnqhf8PVSpPxQTihG4AAAAASUVORK5CYII=\n" + }, + "metadata": { + "needs_background": "light" + }, + "output_type": "display_data" + } + ], + "execution_count": 6 + }, + { + "cell_type": "code", + "execution_count": null, + "outputs": [], + "source": [ + "balanced_scores_ECE, description = evaluate_calibration_wrappers(HistogramBinning, confidences=balanced_confs,\n", + " gt_labels=balanced_gt, metric=\"ECE\", cv=4)\n", + "\n", + "plot_scores(balanced_scores_ECE, title=description)\n", + "plt.show()" + ], + "metadata": { + "collapsed": false, + "pycharm": { + "name": "#%%\n" + } + } + }, + { + "cell_type": "code", + "execution_count": null, + "outputs": [], + "source": [ + "unbalanced_scores_ECE, description = evaluate_calibration_wrappers(TemperatureScaling, confidences=unbalanced_confs,\n", + " gt_labels=unbalanced_gt, metric=\"ECE\")\n", + "\n", + "plot_scores(unbalanced_scores_ECE, title=description)\n", + "plt.show()" + ], + "metadata": { + "collapsed": false, + "pycharm": { + "name": "#%%\n" + } + } + }, + { + "cell_type": "markdown", + "source": [ + "## Evaluating wrappers on multiple metrics and plotting next to each other" + ], + "metadata": { + "collapsed": false, + "pycharm": { + "name": "#%% md\n" + } + } + }, + { + "cell_type": "code", + "execution_count": null, + "outputs": [], + "source": [ + "eval_results = perform_default_evaluation(confidences=balanced_confs, gt_labels=balanced_gt)" + ], + "metadata": { + "collapsed": false, + "pycharm": { + "name": "#%%\n" + } + } + }, + { + "cell_type": "code", + "execution_count": null, + "outputs": [], + "source": [ + "plot_default_evaluation_results(eval_results, title_addon=\"Balanced\")" + ], + "metadata": { + "collapsed": false, + "pycharm": { + "name": "#%%\n" + } + } + }, + { + "cell_type": "code", + "execution_count": null, + "outputs": [], + "source": [ + "unbalanced_eval_results = perform_default_evaluation(confidences=unbalanced_confs, gt_labels=unbalanced_gt)" + ], + "metadata": { + "collapsed": false, + "pycharm": { + "name": "#%%\n" + } + } + }, + { + "cell_type": "code", + "execution_count": null, + "outputs": [], + "source": [ + "plot_default_evaluation_results(unbalanced_eval_results, title_addon=\"Unbalanced\")\n" + ], + "metadata": { + "collapsed": false, + "pycharm": { + "name": "#%%\n" + } + } + }, + { + "cell_type": "markdown", + "source": [ + "# Part 2: Resnet\n", + "\n", + "Here we will repeat the evaluation of calibration methods on a neural network, specifically\n", + "on resnet20 trained on the CIFAR10 data set.\n", + "\n", + "Important: in order 
to run the resnet part you will need the packages from `requirements-torch.txt`" + ], + "metadata": { + "collapsed": false, + "pycharm": { + "name": "#%% md\n" + } + } + }, + { + "cell_type": "code", + "execution_count": null, + "outputs": [], + "source": [ + "from kyle.models.resnet import load_weights, resnet20, resnet56\n", + "from kyle.datasets import get_cifar10_dataset" + ], + "metadata": { + "collapsed": false, + "pycharm": { + "name": "#%%\n" + } + } + }, + { + "cell_type": "code", + "execution_count": null, + "outputs": [], + "source": [ + "selected_resnet = \"resnet20\"\n", + "\n", + "weights_file_names = {\n", + " \"resnet20\": \"resnet20-12fca82f.th\",\n", + " \"resnet56\": \"resnet56-4bfd9763.th\",\n", + "}\n", + "\n", + "models_dict = {\n", + " \"resnet20\": resnet20(),\n", + " \"resnet56\": resnet56(),\n", + "}\n", + "\n", + "\n", + "resnet_path = os.path.join(\"data\", \"artifacts\", weights_file_names[selected_resnet])\n", + "cifar_10_data_path = os.path.join(\"data\", \"raw\", \"cifar10\")\n", + "logits_save_path = os.path.join(\"data\", \"processed\", \"cifar10\", f\"logits_{selected_resnet}.npy\")\n", + "\n", + "if not os.path.isfile(resnet_path):\n", + " print(f\"Downloading weights for {selected_resnet} to {os.path.abspath(resnet_path)}\")\n", + " os.makedirs(os.path.dirname(resnet_path), exist_ok=True)\n", + " url = f\"https://github.com/akamaster/pytorch_resnet_cifar10/raw/master/pretrained_models/{weights_file_names[selected_resnet]}\"\n", + " r = requests.get(url)\n", + " with open(resnet_path, 'wb') as file:\n", + " file.write(r.content)\n", + "\n", + "resnet = models_dict[selected_resnet]\n", + "load_weights(resnet_path, resnet)\n", + "resnet.eval()\n", + "\n", + "def get_cifar10_confidences():\n", + " cifar_10_X, cifar_10_Y = get_cifar10_dataset(cifar_10_data_path)\n", + "\n", + " if os.path.isfile(logits_save_path):\n", + " logits = np.load(logits_save_path)\n", + " else:\n", + " # processing all at once may not fit into ram\n", + " batch_boundaries = range(0, len(cifar_10_X) +1, 1000)\n", + "\n", + " logits = []\n", + " for i in range(len(batch_boundaries) - 1):\n", + " print(f\"Processing batch {i+1}/{len(batch_boundaries)-1}\", end=\"\\r\")\n", + " lower, upper = batch_boundaries[i], batch_boundaries[i+1]\n", + " logits.append(resnet(cifar_10_X[lower:upper]).detach().numpy())\n", + "\n", + " logits = np.vstack(logits)\n", + " os.makedirs(os.path.dirname(logits_save_path), exist_ok=True)\n", + " np.save(logits_save_path, logits, allow_pickle=False)\n", + "\n", + "\n", + " confidences = softmax(logits, axis=1)\n", + " gt_labels = cifar_10_Y.numpy()\n", + " return confidences, gt_labels" + ], + "metadata": { + "collapsed": false, + "pycharm": { + "name": "#%%\n" + } + } + }, + { + "cell_type": "code", + "execution_count": null, + "outputs": [], + "source": [ + "cifar_confs, cifar_gt = get_cifar10_confidences()\n", + "\n", + "## Evaluating wrappers on a single calibration method" + ], + "metadata": { + "collapsed": false, + "pycharm": { + "name": "#%%\n" + } + } + }, + { + "cell_type": "code", + "execution_count": null, + "outputs": [], + "source": [ + "resnet_scores_ECE, description = evaluate_calibration_wrappers(HistogramBinning, confidences=cifar_confs,\n", + " gt_labels=cifar_gt, metric=\"ECE\", cv=4)\n", + "\n", + "plot_scores(resnet_scores_ECE, title=description)\n", + "plt.show()" + ], + "metadata": { + "collapsed": false, + "pycharm": { + "name": "#%%\n" + } + } + }, + { + "cell_type": "code", + "execution_count": null, + "outputs": [], + "source": [ + 
"resnet_scores_ECE, description = evaluate_calibration_wrappers(TemperatureScaling, confidences=cifar_confs,\n", + " gt_labels=cifar_gt, metric=\"ECE\", cv=4)\n", + "\n", + "plot_scores(resnet_scores_ECE, title=description)\n", + "plt.show()" + ], + "metadata": { + "collapsed": false, + "pycharm": { + "name": "#%%\n" + } + } + }, + { + "cell_type": "markdown", + "source": [ + "## Evaluating wrappers on multiple metrics and plotting next to each other" + ], + "metadata": { + "collapsed": false + } + }, + { + "cell_type": "code", + "execution_count": null, + "outputs": [], + "source": [ + "eval_results = perform_default_evaluation(confidences=balanced_confs, gt_labels=balanced_gt)" + ], + "metadata": { + "collapsed": false, + "pycharm": { + "name": "#%%\n" + } + } + }, + { + "cell_type": "code", + "execution_count": null, + "outputs": [], + "source": [ + "plot_default_evaluation_results(eval_results, title_addon=f\"{selected_resnet} on CIFAR10\")" + ], + "metadata": { + "collapsed": false, + "pycharm": { + "name": "#%%\n" + } + } + }, + { + "cell_type": "code", + "execution_count": null, + "outputs": [], + "source": [], + "metadata": { + "collapsed": false, + "pycharm": { + "name": "#%%\n" + } + } + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 2 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython2", + "version": "2.7.6" + } + }, + "nbformat": 4, + "nbformat_minor": 0 +} \ No newline at end of file diff --git a/notebooks/fake_classifiers.ipynb b/notebooks/fake_classifiers.ipynb new file mode 100644 index 0000000..39a9a2e --- /dev/null +++ b/notebooks/fake_classifiers.ipynb @@ -0,0 +1,416 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "%load_ext autoreload\n", + "%autoreload 2" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "outputs": [], + "source": [ + "from kyle.sampling.fake_clf import DirichletFC\n", + "from kyle.evaluation import EvalStats, compute_accuracy, compute_ECE, compute_expected_max\n", + "from kyle.transformations import *\n", + "\n", + "import numpy as np\n", + "import matplotlib.pyplot as plt\n", + "import logging\n", + "\n", + "logging.basicConfig(level=logging.INFO)\n" + ], + "metadata": { + "collapsed": false, + "pycharm": { + "name": "#%%\n" + } + } + }, + { + "cell_type": "code", + "execution_count": null, + "outputs": [], + "source": [ + "n_samples = 100000" + ], + "metadata": { + "collapsed": false, + "pycharm": { + "name": "#%%\n" + } + } + }, + { + "cell_type": "markdown", + "source": [ + "# Dirichlet fake classifiers\n", + "\n", + "Add explanation about the model and integrals\n", + "\n", + "## Computing properties with integrals\n", + "\n", + "The asymptotic values for ECE and accuracy can be computed through (numerical or analytical)\n", + "integration." 
+ ], + "metadata": { + "collapsed": false, + "pycharm": { + "name": "#%% md\n" + } + } + }, + { + "cell_type": "code", + "execution_count": null, + "outputs": [], + "source": [ + "\n", + "n_classes = 3\n", + "alpha=[0.2, 0.3, 0.4]\n", + "\n", + "dirichlet_fc = DirichletFC(n_classes, alpha=alpha)" + ], + "metadata": { + "collapsed": false, + "pycharm": { + "name": "#%%\n" + } + } + }, + { + "cell_type": "code", + "execution_count": null, + "outputs": [], + "source": [ + "print(\"mostly underestimating all classes (starting at 1/n_classes) with PowerLawSimplexAut\")\n", + "transform = PowerLawSimplexAut(np.array([2, 2, 2]))\n", + "dirichlet_fc.set_simplex_automorphism(transform)\n", + "eval_stats = EvalStats(*dirichlet_fc.get_sample_arrays(n_samples))\n", + "\n", + "print(f\"Accuracy is {eval_stats.accuracy()}\")\n", + "print(f\"ECE is {eval_stats.expected_calibration_error()}\")\n", + "ece_approx = - eval_stats.expected_confidence() + eval_stats.accuracy()\n", + "print(f\"{ece_approx=}\")\n", + "eval_stats.plot_reliability_curves([0, 1, EvalStats.TOP_CLASS_LABEL], display_weights=True)\n", + "\n", + "\n", + "theoretical_acc = compute_accuracy(dirichlet_fc)[0]\n", + "theoretical_ece = compute_ECE(dirichlet_fc)[0]\n", + "print(f\"{theoretical_acc=} , {theoretical_ece=}\")\n", + "\n" + ], + "metadata": { + "collapsed": false, + "pycharm": { + "name": "#%%\n" + } + } + }, + { + "cell_type": "code", + "execution_count": null, + "outputs": [], + "source": [ + "print(\"mostly overestimating all classes (starting at 1/n_classes) with PowerLawSimplexAut\")\n", + "print(\"Note the variance and the resulting sensitivity to binning\")\n", + "\n", + "transform = PowerLawSimplexAut(np.array([0.3, 0.1, 0.2]))\n", + "dirichlet_fc.set_simplex_automorphism(transform)\n", + "eval_stats = EvalStats(*dirichlet_fc.get_sample_arrays(n_samples), bins=500)\n", + "\n", + "print(f\"Accuracy is {eval_stats.accuracy()}\")\n", + "print(f\"ECE is {eval_stats.expected_calibration_error()}\")\n", + "ece_approx = eval_stats.expected_confidence() - eval_stats.accuracy()\n", + "print(f\"{ece_approx=}\")\n", + "eval_stats.plot_reliability_curves([0, 1, EvalStats.TOP_CLASS_LABEL], display_weights=True)\n", + "\n", + "\n", + "# theoretical_acc = compute_accuracy(dirichlet_fc)[0]\n", + "# theoretical_ece = compute_ECE(dirichlet_fc)[0]\n", + "# print(f\"{theoretical_acc=} , {theoretical_ece=}\")\n" + ], + "metadata": { + "collapsed": false, + "pycharm": { + "name": "#%%\n" + } + } + }, + { + "cell_type": "code", + "source": [ + "print(\"Overestimating predictions with MaxComponent\")\n", + "\n", + "def overestimating_max(x: np.ndarray):\n", + " x = x.copy()\n", + " mask = x > 1/2\n", + " x[mask] = x[mask] - (1/4 - (1-x[mask])**2)\n", + " return x\n", + "\n", + "transform = MaxComponentSimplexAut(overestimating_max)\n", + "dirichlet_fc.set_simplex_automorphism(transform)\n", + "eval_stats = EvalStats(*dirichlet_fc.get_sample_arrays(n_samples))\n", + "\n", + "print(f\"Accuracy is {eval_stats.accuracy()}\")\n", + "print(f\"ECE is {eval_stats.expected_calibration_error()}\")\n", + "eval_stats.plot_reliability_curves([0, 1, EvalStats.TOP_CLASS_LABEL], display_weights=True)\n", + "\n", + "# Integrals converge pretty slowly, this takes time\n", + "# theoretical_acc = compute_accuracy(dirichlet_fc, opts={\"limit\": 75})[0]\n", + "# theoretical_ece = compute_ECE(dirichlet_fc, opts={\"limit\": 75})[0]\n", + "# print(f\"{theoretical_acc=} , {theoretical_ece=}\")" + ], + "metadata": { + "collapsed": false, + "pycharm": { + "name": "#%%\n" + } 
+ }, + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "markdown", + "source": [ + "# Analytical results\n", + "\n", + "For top-class overconfident classifiers we have\n", + "\n", + "$ECE_i = \\int_{A_i} \\ (c_i - h_i(\\vec c)) \\cdot p(\\vec c)$\n", + "\n", + "$acc_i = \\int_{A_i} \\ h_i(\\vec c) \\cdot p(\\vec c)$\n", + "\n", + "In many relevant regimes, the DirichletFC can be approximately regarded as sufficiently confident.\n", + "This means we can approximate ECE and accuracy as:\n", + "\n", + "$ECE_i \\ \\lessapprox \\ \\int_{\\tilde A_i} \\ (c_i - h_i(\\vec c)) \\cdot p(\\vec c)$\n", + "\n", + "$acc_i \\ \\lessapprox \\ \\int_{\\tilde A_i} \\ h_i(\\vec c) \\cdot p(\\vec c)$\n", + "\n", + "We can explicitly calculate the first part of the ECE:\n", + "\n", + "$ \\int_{\\tilde A_i} \\ c_i \\cdot p(\\vec c) = \\frac{\\alpha_i}{\\alpha_0}\n", + "\\left(1 - (\\alpha_0-\\alpha_i) \\ \\beta(1/2;\\ \\alpha_i + 1, \\alpha_0-\\alpha_i) \\ \\binom{\\alpha_0}{\\alpha_i} \\right)$\n", + "\n", + "As expected, when $\\alpha_i \\rightarrow \\alpha_0$, this expression goes to one" + ], + "metadata": { + "collapsed": false, + "pycharm": { + "name": "#%% md\n" + } + } + }, + { + "cell_type": "markdown", + "source": [ + "The second part depends on the simplex automorphism $h$.\n", + "We can sort of compute it for the RestrictedPowerAut and for some MaxComponentSimplexAut.\n", + "However, both transforms seem to be rather on the pathological side of things..." + ], + "metadata": { + "collapsed": false, + "pycharm": { + "name": "#%% md\n" + } + } + }, + { + "cell_type": "code", + "execution_count": null, + "outputs": [], + "source": [ + "print(\"mostly underestimating first two classes with RestrictedPowerSimplexAut\")\n", + "\n", + "transform = RestrictedPowerSimplexAut(np.array([2, 4]))\n", + "dirichlet_fc.set_simplex_automorphism(transform)\n", + "eval_stats = EvalStats(*dirichlet_fc.get_sample_arrays(n_samples))\n", + "\n", + "print(f\"Accuracy is {eval_stats.accuracy()}\")\n", + "print(f\"ECE is {eval_stats.expected_calibration_error()}\")\n", + "print(\"Theoretical approximation of ECE\")\n", + "print(eval_stats.expected_confidence() - eval_stats.accuracy())\n", + "eval_stats.plot_reliability_curves([0, 1, 2, EvalStats.TOP_CLASS_LABEL], display_weights=True)\n", + "\n", + "\n", + "# theoretical_acc = compute_accuracy(dirichlet_fc)[0]\n", + "# theoretical_ece = compute_ECE(dirichlet_fc)[0]\n", + "# print(f\"{theoretical_acc=} , {theoretical_ece=}\")" + ], + "metadata": { + "collapsed": false, + "pycharm": { + "name": "#%%\n" + } + } + }, + { + "cell_type": "code", + "execution_count": null, + "outputs": [], + "source": [ + "print(f\"\"\"\n", + "NOTE: here the ECE completely fails to converge to it's true, continuous value.\n", + "This is probably due to the binning-variance, see plots below with 500 bins.\n", + "The sharp peak in weights at the end certainly does not help convergence either.\n", + "\"\"\")\n", + "\n", + "eval_stats.set_bins(500)\n", + "eval_stats.plot_reliability_curves([EvalStats.TOP_CLASS_LABEL], display_weights=True)" + ], + "metadata": { + "collapsed": false, + "pycharm": { + "name": "#%%\n" + } + } + }, + { + "cell_type": "markdown", + "source": [ + "## The Calibration Game\n", + "\n", + "Below are potential 5-classes classifiers that we will use in the calibration game.\n", + "They all have roughly the same accuracy but very differing ECEs, corresponding to\n", + "different difficulty settings for the game." 
+ ], + "metadata": { + "collapsed": false, + "pycharm": { + "name": "#%% md\n" + } + } + }, + { + "cell_type": "code", + "execution_count": null, + "outputs": [], + "source": [ + "n_classes = 5\n", + "n_samples = 500000" + ], + "metadata": { + "collapsed": false, + "pycharm": { + "name": "#%%\n" + } + } + }, + { + "cell_type": "code", + "execution_count": null, + "outputs": [], + "source": [ + "print(\"hardest setting: accuracy 80, ECE 18\")\n", + "\n", + "exponents = np.array([0.05, 0.4, 0.1, 0.2, 0.1]) * 2/3\n", + "alpha = np.ones(5) * 1/150\n", + "\n", + "# exponents = np.ones(5) * 1/5\n", + "# alpha = np.ones(5) * 1/45\n", + "\n", + "dirichlet_fc = DirichletFC(n_classes, alpha=alpha)\n", + "transform = PowerLawSimplexAut(exponents)\n", + "dirichlet_fc.set_simplex_automorphism(transform)\n", + "eval_stats = EvalStats(*dirichlet_fc.get_sample_arrays(n_samples), bins=200)\n", + "\n", + "print(f\"Accuracy is {eval_stats.accuracy()}\")\n", + "print(f\"ECE is {eval_stats.expected_calibration_error()}\")\n", + "eval_stats.plot_reliability_curves([0, eval_stats.TOP_CLASS_LABEL], display_weights=True)" + ], + "metadata": { + "collapsed": false, + "pycharm": { + "name": "#%%\n" + } + } + }, + { + "cell_type": "code", + "execution_count": null, + "outputs": [], + "source": [ + "print(\"medium setting: accuracy 80, ECE 10\")\n", + "\n", + "exponents = np.array([0.5, 1, 1, 1, 0.5]) * 1/1.8\n", + "alpha = np.array([0.5, 2, 3, 4, 5]) * 1/65\n", + "\n", + "n_samples = 300000\n", + "n_classes = 5\n", + "\n", + "\n", + "dirichlet_fc = DirichletFC(n_classes, alpha=alpha)\n", + "transform = PowerLawSimplexAut(exponents)\n", + "dirichlet_fc.set_simplex_automorphism(transform)\n", + "eval_stats = EvalStats(*dirichlet_fc.get_sample_arrays(n_samples), bins=200)\n", + "\n", + "print(f\"Accuracy is {eval_stats.accuracy()}\")\n", + "print(f\"ECE is {eval_stats.expected_calibration_error()}\")\n", + "eval_stats.plot_reliability_curves([4, eval_stats.TOP_CLASS_LABEL], display_weights=True)" + ], + "metadata": { + "collapsed": false, + "pycharm": { + "name": "#%%\n" + } + } + }, + { + "cell_type": "code", + "execution_count": null, + "outputs": [], + "source": [ + "print(\"mostly underestimating all classes (starting at 1/n_classes)\")\n", + "\n", + "\n", + "# accuracy 80, ECE 0\n", + "alpha = np.array([1, 2, 3, 2, 3]) * 1/19\n", + "\n", + "n_samples = 300000\n", + "n_classes = 5\n", + "\n", + "dirichlet_fc = DirichletFC(n_classes, alpha=alpha)\n", + "eval_stats = EvalStats(*dirichlet_fc.get_sample_arrays(n_samples))\n", + "\n", + "print(f\"Accuracy is {eval_stats.accuracy()}\")\n", + "print(f\"ECE is {eval_stats.expected_calibration_error()}\")\n", + "eval_stats.plot_reliability_curves([4, eval_stats.TOP_CLASS_LABEL], display_weights=True)\n" + ], + "metadata": { + "collapsed": false, + "pycharm": { + "name": "#%%\n" + } + } + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 2 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython2", + "version": "2.7.6" + } + }, + "nbformat": 4, + "nbformat_minor": 0 +} \ No newline at end of file diff --git a/notebooks/test_notebooks.py b/notebooks/test_notebooks.py new file mode 100644 index 0000000..d37fb2a --- /dev/null +++ b/notebooks/test_notebooks.py @@ -0,0 +1,40 @@ +import logging +import os + +import nbformat +import pytest +from 
nbconvert.preprocessors import ExecutePreprocessor + +NOTEBOOKS_DIR = "notebooks" +DOCS_DIR = "docs" +resources = {"metadata": {"path": NOTEBOOKS_DIR}} + +log = logging.getLogger(__name__) + + +notebooks_to_ignore = ["evaluating_cal_methods.ipynb"] + +notebooks_to_test = [ + file + for file in os.listdir(NOTEBOOKS_DIR) + if file.endswith(".ipynb") and file not in notebooks_to_ignore +] + + +@pytest.mark.parametrize("notebook", notebooks_to_test) +def test_notebook(notebook): + notebook_path = os.path.join(NOTEBOOKS_DIR, notebook) + log.info(f"Reading jupyter notebook from {notebook_path}") + with open(notebook_path) as f: + nb = nbformat.read(f, as_version=4) + ep = ExecutePreprocessor(timeout=600) + with ep.setup_preprocessor(nb, resources=resources): + for i, cell in enumerate(nb["cells"]): + log.info(f"processing cell {i} from {notebook}") + ep.preprocess_cell(cell, resources=resources, cell_index=i) + + # saving the executed notebook to docs + output_path = os.path.join(DOCS_DIR, notebook) + log.info(f"Saving executed notebook to {output_path} for documentation purposes") + with open(output_path, "w", encoding="utf-8") as f: + nbformat.write(nb, f) diff --git a/notebooks/trained_models/lenet5.ckpt b/notebooks/trained_models/lenet5.ckpt new file mode 100644 index 0000000..d46d9be Binary files /dev/null and b/notebooks/trained_models/lenet5.ckpt differ diff --git a/notebooks_needing_refactoring/fitting_fake_classifiers.ipynb b/notebooks_needing_refactoring/fitting_fake_classifiers.ipynb new file mode 100644 index 0000000..ef2cf63 --- /dev/null +++ b/notebooks_needing_refactoring/fitting_fake_classifiers.ipynb @@ -0,0 +1,1317 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": null, + "id": "070ccf34", + "metadata": {}, + "outputs": [], + "source": [ + "%load_ext autoreload\n", + "%autoreload 2" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "b885c90e", + "metadata": {}, + "outputs": [], + "source": [ + "import os\n", + "import logging\n", + "\n", + "from kyle.sampling.fake_clf import DirichletFC, MultiDirichletFC\n", + "from kyle.evaluation import EvalStats, compute_accuracy, compute_ECE, compute_expected_max\n", + "from kyle.transformations import *\n", + "from kyle.calibration.calibration_methods import TemperatureScaling\n", + "\n", + "import torch\n", + "import torch.nn as nn\n", + "import torch.nn.functional as F\n", + "import torch.nn.init as init\n", + "import torchvision\n", + "import torchvision.datasets as datasets\n", + "import torchvision.transforms as transforms\n", + "import pytorch_lightning as pl\n", + "\n", + "import scipy.stats\n", + "import scipy.optimize\n", + "import numpy as np\n", + "import matplotlib.pyplot as plt" + ] + }, + { + "cell_type": "markdown", + "id": "ef78eb40", + "metadata": {}, + "source": [ + "# Realistic Fake Classifiers\n", + "\n", + "It's good to have a model of what a realistic Fake Classifier should look like.\n", + "\n", + "Probably the simplest model for the fake classifier confidence vector distribution is the **Dirichlet Fake Classifier**:\n", + "\n", + "\\begin{equation}\n", + "C \\sim Dirichlet(\\alpha_1, \\alpha_2, \\alpha_3, ...)\n", + "\\end{equation}\n", + "\n", + "This classifier has a total of 'num_classes' parameters\n", + "\n", + "However, this model is possibly a bit too simple as it only has a single local maximum in the distribution. A realistic fake classifier might for example have multiple local maxima in each of the corners of the simplex, i.e. 
it generally is very confident in its prediction and only very rarely uncertain (center of the simplex). Something similar can actually be achieved using the Dirichlet distribution by setting all the parameters $\\alpha_n < 1$. This pushes the distribution out into the corners, but unfortunately also out onto the sides of the simplex, which is not quite what we want. The center of a side of the simplex corresponds to a confidence vector $\\vec c = (1/(\\text{num_classes}-1), 1/(\\text{num_classes}-1), ..., 1/(\\text{num_classes}-1), 0)$, i.e. maximally uncertain among all but one of the classes.\n", + "\n", + "Therefore we also consider two other Fake Classifiers that can have multiple local maxima, one in each of the corners, and therefore possibly represent real neural networks better:\n", + "\n", + "Firstly, the **Multi-Dirichlet Fake Classifier**:\n", + "\n", + "\\begin{align}\n", + "K & \\sim Categorical(p_1, p_2, p_3, ...) \\\\\n", + "C & \\sim Dirichlet_k(\\sigma_k\\cdot[1, 1, ..., 1, \\alpha_k, 1, ...])\n", + "\\end{align}\n", + "\n", + "This classifier has a total of '3 x num_classes' parameters.\n", + "\n", + "I.e., we first draw from a K-categorical distribution and, based on the result, we then draw from one of K Dirichlet distributions. Each of the K Dirichlet distributions has two parameters $\\sigma_k$ and $\\alpha_k$, which represent the width and position of the local maximum in the k-th corner of the simplex.\n", + "\n", + "Note: The pdf of this mixture distribution will be a weighted sum of the individual Dirichlet distributions.\n", + "\n", + "Secondly, the **Multi-Gaussian Fake Classifier**:\n", + "\n", + "K-Categorical followed by one of K Gaussians followed by softmax\n", + "\n", + "(probably doesn't make too much sense, as the Multi-Gaussian pdf is analytically intractable due to the softmax transformation)" + ] + },
+ { + "cell_type": "markdown", + "id": "854471a8", + "metadata": {}, + "source": [ + "**Note**:\n", + "\n", + "When talking about the simple Dirichlet FC, $\\vec{\\alpha}$ refers to the vector of $\\alpha$ parameters of the Dirichlet distribution.\n", + "\n", + "When talking about the Multi-Dirichlet FC, $\\vec{\\alpha_k}$ refers to the vector of $\\alpha_k$ parameters and should not be confused with the parameter vector of a single Dirichlet."
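+ ] + }, + { + "cell_type": "markdown", + "id": "3fa81c2e", + "metadata": {}, + "source": [ + "To make the generative process concrete, here is a minimal NumPy sketch of the Multi-Dirichlet sampling step. It is\n", + "purely illustrative; the actual implementation used in this notebook is `MultiDirichletFC` from\n", + "`kyle.sampling.fake_clf`, imported above." + ] + },
+ { + "cell_type": "code", + "execution_count": null, + "id": "9b4d5e6f", + "metadata": {}, + "outputs": [], + "source": [ + "# Illustrative sketch of the Multi-Dirichlet generative process described above\n", + "def sample_multi_dirichlet(p, alphas, sigmas, n_samples):\n", + "    n_classes = len(p)\n", + "    confs = np.zeros((n_samples, n_classes))\n", + "    # first draw the mixture component k, then draw from the k-th Dirichlet, whose\n", + "    # parameter vector is sigma_k * [1, ..., 1, alpha_k, 1, ..., 1] (alpha_k at position k)\n", + "    components = np.random.choice(n_classes, size=n_samples, p=p)\n", + "    for i, k in enumerate(components):\n", + "        alpha_vec = np.ones(n_classes)\n", + "        alpha_vec[k] = alphas[k]\n", + "        confs[i] = np.random.dirichlet(sigmas[k] * alpha_vec)\n", + "    return confs" + ] + },
+ { + "cell_type": "markdown", + "id": "7c8d9e0a", + "metadata": {}, + "source": [ + "Calling e.g. `sample_multi_dirichlet([0.2, 0.3, 0.5], alphas=[20, 20, 20], sigmas=[1, 1, 1], n_samples=1000)` would\n", + "yield confidence vectors clustered near the three corners of the 2-simplex (the function name and signature here\n", + "are, again, purely illustrative)."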
+ ] + }, + { + "cell_type": "markdown", + "id": "05413ca4", + "metadata": {}, + "source": [ + "In order to get an actually realistic Fake Classifier, we use these three Fake Classifier models and fit their distributions to the observed confidence vector distributions of a couple of different neural networks.\n", + "\n", + "In this case we use:\n", + "\n", + "**LeNet 5** on CIFAR 10\n", + "\n", + "**ResNet 20** on CIFAR 10\n", + "\n", + "**ResNet 110** on CIFAR 10" + ] + },
+ { + "cell_type": "markdown", + "id": "b7881576", + "metadata": {}, + "source": [ + "## Preparing Data and Neural Networks" + ] + },
+ { + "cell_type": "code", + "execution_count": null, + "id": "606847e3", + "metadata": {}, + "outputs": [], + "source": [ + "# load the CIFAR10 dataset\n", + "# normalization also from https://github.com/akamaster/pytorch_resnet_cifar10\n", + "normalize = transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])\n", + "\n", + "cifar_train_set = datasets.CIFAR10(os.getcwd(), train=True, download=True,\n", + "                                   transform=transforms.Compose([transforms.ToTensor(), normalize]))\n", + "cifar_test_set = datasets.CIFAR10(os.getcwd(), train=False, download=True,\n", + "                                  transform=transforms.Compose([transforms.ToTensor(), normalize]))\n", + "\n", + "cifar_train = torch.utils.data.DataLoader(cifar_train_set, batch_size=4, shuffle=True, num_workers=1)\n", + "cifar_test = torch.utils.data.DataLoader(cifar_test_set, batch_size=4, shuffle=False, num_workers=1)" + ] + },
+ { + "cell_type": "code", + "execution_count": null, + "id": "b17a43d5", + "metadata": {}, + "outputs": [], + "source": [ + "# Small, simple LeNet5 for CIFAR10 classification\n", + "\n", + "class lenet5(pl.LightningModule):\n", + "\n", + "    def __init__(self):\n", + "        super().__init__()\n", + "        self.conv1 = nn.Conv2d(3, 6, 5)\n", + "        self.conv2 = nn.Conv2d(6, 16, 5)\n", + "        self.fc1 = nn.Linear(16 * 5 * 5, 120)\n", + "        self.fc2 = nn.Linear(120, 84)\n", + "        self.fc3 = nn.Linear(84, 10)\n", + "    \n", + "    def forward(self, x):\n", + "        x = F.max_pool2d(F.relu(self.conv1(x)), 2)\n", + "        x = F.max_pool2d(F.relu(self.conv2(x)), 2)\n", + "        x = x.view(x.size(0), -1)\n", + "        x = F.relu(self.fc1(x))\n", + "        x = F.relu(self.fc2(x))\n", + "        x = self.fc3(x)\n", + "        return x\n", + "\n", + "    def training_step(self, batch, batch_idx):\n", + "        x, target = batch\n", + "        output = self(x)\n", + "        loss = F.cross_entropy(output, target)\n", + "        return loss\n", + "    \n", + "    def validation_step(self, batch, batch_idx):\n", + "        x, target = batch\n", + "        output = self(x)\n", + "        loss = F.cross_entropy(output, target)\n", + "        self.log('val_loss', loss, on_step=False, on_epoch=True, prog_bar=True)\n", + "    \n", + "    def configure_optimizers(self):\n", + "        optimizer = torch.optim.SGD(self.parameters(), lr=0.001, momentum=0.9)\n", + "        return optimizer\n", + "    " + ] + },
+ { + "cell_type": "code", + "execution_count": null, + "id": "83698ff2", + "metadata": {}, + "outputs": [], + "source": [ + "#Proper implementation of ResNet20 for Cifar10.
Pytorch only has ResNets for ImageNet which\n", + "#differ in number of parameters\n", + "#Code taken from: https://github.com/akamaster/pytorch_resnet_cifar10\n", + "\n", + "def _weights_init(m):\n", + " classname = m.__class__.__name__\n", + " #print(classname)\n", + " if isinstance(m, nn.Linear) or isinstance(m, nn.Conv2d):\n", + " init.kaiming_normal_(m.weight)\n", + "\n", + "class LambdaLayer(nn.Module):\n", + " def __init__(self, lambd):\n", + " super(LambdaLayer, self).__init__()\n", + " self.lambd = lambd\n", + "\n", + " def forward(self, x):\n", + " return self.lambd(x)\n", + "\n", + "\n", + "class BasicBlock(nn.Module):\n", + " expansion = 1\n", + "\n", + " def __init__(self, in_planes, planes, stride=1, option='A'):\n", + " super(BasicBlock, self).__init__()\n", + " self.conv1 = nn.Conv2d(in_planes, planes, kernel_size=3, stride=stride, padding=1, bias=False)\n", + " self.bn1 = nn.BatchNorm2d(planes)\n", + " self.conv2 = nn.Conv2d(planes, planes, kernel_size=3, stride=1, padding=1, bias=False)\n", + " self.bn2 = nn.BatchNorm2d(planes)\n", + "\n", + " self.shortcut = nn.Sequential()\n", + " if stride != 1 or in_planes != planes:\n", + " if option == 'A':\n", + " \"\"\"\n", + " For CIFAR10 ResNet paper uses option A.\n", + " \"\"\"\n", + " self.shortcut = LambdaLayer(lambda x:\n", + " F.pad(x[:, :, ::2, ::2], (0, 0, 0, 0, planes//4, planes//4), \"constant\", 0))\n", + " elif option == 'B':\n", + " self.shortcut = nn.Sequential(\n", + " nn.Conv2d(in_planes, self.expansion * planes, kernel_size=1, stride=stride, bias=False),\n", + " nn.BatchNorm2d(self.expansion * planes)\n", + " )\n", + "\n", + " def forward(self, x):\n", + " out = F.relu(self.bn1(self.conv1(x)))\n", + " out = self.bn2(self.conv2(out))\n", + " out += self.shortcut(x)\n", + " out = F.relu(out)\n", + " return out\n", + "\n", + "\n", + "class ResNet(nn.Module):\n", + " def __init__(self, block, num_blocks, num_classes=10):\n", + " super(ResNet, self).__init__()\n", + " self.in_planes = 16\n", + "\n", + " self.conv1 = nn.Conv2d(3, 16, kernel_size=3, stride=1, padding=1, bias=False)\n", + " self.bn1 = nn.BatchNorm2d(16)\n", + " self.layer1 = self._make_layer(block, 16, num_blocks[0], stride=1)\n", + " self.layer2 = self._make_layer(block, 32, num_blocks[1], stride=2)\n", + " self.layer3 = self._make_layer(block, 64, num_blocks[2], stride=2)\n", + " self.linear = nn.Linear(64, num_classes)\n", + "\n", + " self.apply(_weights_init)\n", + "\n", + " def _make_layer(self, block, planes, num_blocks, stride):\n", + " strides = [stride] + [1]*(num_blocks-1)\n", + " layers = []\n", + " for stride in strides:\n", + " layers.append(block(self.in_planes, planes, stride))\n", + " self.in_planes = planes * block.expansion\n", + "\n", + " return nn.Sequential(*layers)\n", + "\n", + " def forward(self, x):\n", + " out = F.relu(self.bn1(self.conv1(x)))\n", + " out = self.layer1(out)\n", + " out = self.layer2(out)\n", + " out = self.layer3(out)\n", + " out = F.avg_pool2d(out, out.size()[3])\n", + " out = out.view(out.size(0), -1)\n", + " out = self.linear(out)\n", + " return out\n", + " \n", + "\n", + "def resnet20():\n", + " return ResNet(BasicBlock, [3, 3, 3])\n", + "\n", + "\n", + "def resnet32():\n", + " return ResNet(BasicBlock, [5, 5, 5])\n", + "\n", + "\n", + "def resnet44():\n", + " return ResNet(BasicBlock, [7, 7, 7])\n", + "\n", + "\n", + "def resnet56():\n", + " return ResNet(BasicBlock, [9, 9, 9])\n", + "\n", + "\n", + "def resnet110():\n", + " return ResNet(BasicBlock, [18, 18, 18])\n", + "\n", + "\n", + "def 
resnet1202():\n", + " return ResNet(BasicBlock, [200, 200, 200])" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "e146f71f", + "metadata": {}, + "outputs": [], + "source": [ + "#Train a LeNet5\n", + "#Load selftrained LeNet 5 and pretrained Resnet20 and Resnet110 (don't have a dedicated GPU at hand D:)\n", + "#Pretrained nets taken from https://github.com/akamaster/pytorch_resnet_cifar10\n", + "\n", + "#selftrained_lenet5 = lenet5()\n", + "#checkpoint_callback = pl.callbacks.ModelCheckpoint(monitor='val_loss', save_top_k=1, save_last=True)\n", + "#trainer = pl.Trainer(max_epochs=20, logger=False, checkpoint_callback=checkpoint_callback)\n", + "#trainer.fit(selftrained_lenet5, cifar_train, cifar_test)\n", + "\n", + "selftrained_lenet5 = lenet5.load_from_checkpoint('./trained_models/lenet5.ckpt')\n", + "\n", + "pretrained_resnet20 = resnet20()\n", + "pretrained_resnet110 = resnet110()\n", + "\n", + "pretrained_resnet20_dict = torch.load('./trained_models/resnet20-12fca82f.th',\n", + " map_location=torch.device('cpu'))['state_dict']\n", + "pretrained_resnet20_dict = {key.replace(\"module.\", \"\"): value for key, value in pretrained_resnet20_dict.items()}\n", + "pretrained_resnet20.load_state_dict(pretrained_resnet20_dict)\n", + "\n", + "pretrained_resnet110_dict = torch.load('./trained_models/resnet110-1d1ed7c2.th',\n", + " map_location=torch.device('cpu'))['state_dict']\n", + "pretrained_resnet110_dict = {key.replace(\"module.\", \"\"): value for key, value in pretrained_resnet110_dict.items()}\n", + "pretrained_resnet110.load_state_dict(pretrained_resnet110_dict)" + ] + }, + { + "cell_type": "markdown", + "id": "d0c24ab5", + "metadata": {}, + "source": [ + "## Set which neural net to use here" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "00658d30", + "metadata": {}, + "outputs": [], + "source": [ + "neural_net = selftrained_lenet5\n", + "#neural_net = pretrained_resnet20\n", + "#neural_net = pretrained_resnet110" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "b6b46a5c", + "metadata": {}, + "outputs": [], + "source": [ + "#Get NN predictions on CIFAR10 test set\n", + "cifar_test_full = torch.utils.data.DataLoader(cifar_test_set, batch_size=len(cifar_test_set),\n", + " shuffle=False, num_workers=2)\n", + "images, labels = next(iter(cifar_test_full))\n", + "\n", + "neural_net.eval()\n", + "with torch.no_grad():\n", + " logits = neural_net(images)\n", + " prob = F.softmax(logits, dim=1)\n", + " _, predicted = torch.max(prob, dim=1)\n", + " print(f'NLL = {F.cross_entropy(logits, labels)}')\n", + " print(f'accuracy = {(predicted == labels).sum().item() / labels.size(0)}')\n", + " \n", + "gt_labels = labels.numpy()\n", + "confidences = prob.numpy()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "e5992bbb", + "metadata": {}, + "outputs": [], + "source": [ + "gt_labels_copy = gt_labels.copy()\n", + "confidences_copy = confidences.copy()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "ee930ec0", + "metadata": {}, + "outputs": [], + "source": [ + "gt_labels = gt_labels_copy.copy()\n", + "confidences = confidences_copy.copy()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "be9e8a8c", + "metadata": {}, + "outputs": [], + "source": [ + "print(f'num non normalized confidence vectors = {np.sum((np.sum(confidences, axis=1) - 1) >= 1e-10)}')\n", + "\n", + "#confidences are not perfectly normalized due to floating point error\n", + "#scipy.stats.dirichlet.pdf 
is very picky about normalization\n", + "#convert confidences to float64 first for better/more accurate normalization\n", + "\n", + "confidences = np.array(confidences, dtype='float64')\n", + "confidences = confidences / np.sum(confidences, axis=1)[:,None]\n", + "print(f'num non normalized confidence vectors = {np.sum((np.sum(confidences, axis=1) - 1) >= 1e-10)}')\n" + ] + }, + { + "cell_type": "markdown", + "id": "0b4b77d1", + "metadata": {}, + "source": [ + "# Fitting Fake Classifiers using MLE\n", + "\n", + "**All of this is relatively simple/rudimentary. 'Proper' Dirichlet fitting would involve something closer to what is described in https://epub.wu.ac.at/4077/1/Report125.pdf.**\n", + "\n", + "(Note: Any references to fitted results/distributions/graphs in the following sections used the LeNet5 as the 'real' neural network)\n", + "\n", + "Having obtained the confidences of our neural net on the CIFAR 10 test set we can now try to fit an appropriate fake classifier to them. This can be done quite easily for the Dirichlet and Multi-Dirichlet FC's using MLE, as we have relatively simple expressions for the distribution pdf's. The Multi-Gaussian FC is not as easy, as the softmax function complicates the fake classifier's pdf. (It would be necessary to invert the softmax function, which is only possible up to an additive constant. As a result the Multi-Gaussian fake classifier's pdf will be an integral over a Gaussian mixture model's pdf.)" + ] + }, + { + "cell_type": "markdown", + "id": "9a40b2b1", + "metadata": {}, + "source": [ + "## 'Normal' MLE Fitting\n", + "\n", + "MLE is probably the simplest approach when the fake classifier's pdf is known exactly. We calculate the negative log-likelihood of our neural net's confidence vector distribution under the assumption of either a Dirichlet or Multi-Dirichlet distribution. Using one of scipy's many minimization algorithms/functions we can then find the set of parameters of the fake classifier that maximizes the log-likelihood/minimizes the negative log-likelihood.\n", + "\n", + "We have introduced this fitting as the ``.fit()`` method of the FakeClassifier classes." + ] + }, + { + "cell_type": "markdown", + "id": "d1618911", + "metadata": {}, + "source": [ + "Right away we run into a problem: divergences.\n", + "\n", + "On the sides and corners of the simplex, when one or many components of the confidence vector are zero, the log-likelihood diverges. If $c_i = 0$ and the corresponding Dirichlet parameter $\\alpha_i < 1$, the pdf and the log-likelihood diverge, $p(c_i = 0) \\rightarrow + \\infty$. If, however, $\\alpha_i > 1$ then $p(c_i = 0) = 0$, which will again lead to a divergence in the log-likelihood.\n", + "\n", + "Unfortunately a lot of the neural networks are very confident and often predict class confidences near/at $c_i=0$. To mitigate this divergence problem somewhat we can rescale the confidences before fitting according to:\n", + "\n", + "\\begin{equation}\n", + "\\vec{c} = \\frac{\\vec{c}(N-1) + 1/\\text{num_classes}}{N}\n", + "\\end{equation}\n", + "\n", + "where N is the number of samples (see https://epub.wu.ac.at/4077/1/Report125.pdf). All of the following fits use this rescaling. " + ] + }, + { + "cell_type": "markdown", + "id": "e1499c9b", + "metadata": {}, + "source": [ + "As always with fitting the initial guesses and bounds are important:\n", + "\n", + "As discussed at the start a somewhat alright fake classifier can possibly be achieved by using a simple Dirichlet Fake Classifier with alpha parameters $\\alpha_n < 1$. For fitting the DirichletFC appropriate initial guesses and bounds might therefore be $\\vec{\\alpha}_\\text{init} = (1,1,1,1,...)$ and $\\alpha_\\min, \\alpha_\\max = (0.0001, \\text{None})$\n", + "\n", + "As discussed at the start the reasoning behind the Multi-Dirichlet FC is that each separate Dirichlet can be used to create a local maximum in one of the corners of the simplex. This only works if the full alpha vector of each Dirichlet has all entries $>1$ (if any entry is $<1$ a local maximum does not exist), which means for each Dirichlet we need $\\alpha_k >1$ and $\\sigma_k>1$. We also expect the maxima to be very 'squished' into the corners, i.e. $\\alpha_k$ to be large. For fitting the Multi-Dirichlet FC appropriate initial guesses and bounds might therefore be $\\vec{\\alpha}_{k,\\text{init}} = (10,10,10,10,...)$, $\\vec{\\sigma}_\\text{init} = (2,2,2,2,...)$ and $\\alpha_{k,\\min}, \\alpha_{k,\\max} = (1, \\text{None})$, $\\sigma_\\min, \\sigma_\\max = (1, \\text{None})$" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "c4423350", + "metadata": {}, + "outputs": [], + "source": [ + "#Save 'EvalStats' and NLL of all the fitted FC's for easier comparison later\n", + "save_fc_eval = []\n", + "save_NLL = []" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "e6331f17", + "metadata": {}, + "outputs": [], + "source": [ + "#Fit Dirichlet FC to the test set confidence vector distributions using MLE fitting\n", + "num_classes = confidences.shape[1]\n", + "\n", + "Dir_FC = DirichletFC(num_classes)\n", + "mle_results = Dir_FC.fit(confidences, initial_alpha=np.ones(num_classes), alpha_bounds=(0.0001, None))\n", + "\n", + "save_NLL.append(mle_results.fun)\n", + "print(f'final NLL = {mle_results.fun}')\n", + "print(f'Fitted parameters = {Dir_FC.alpha}')" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "fc9475c1", + "metadata": {}, + "outputs": [], + "source": [ + "#Fit Multi-Dirichlet FC to the test set confidence vector distributions using MLE fitting\n", + "num_classes = confidences.shape[1]\n", + "\n", + "MultiDir_FC = MultiDirichletFC(num_classes)\n", + "mle_results = MultiDir_FC.fit(confidences,\n", + " initial_parameters=np.array([10, 2, 1/num_classes]),\n", + " parameter_bounds=[(1,None), (1,None), (0,1)], \n", + " simplified_fitting=False)\n", + "\n", + "save_NLL.append(mle_results.fun)\n", + "print(f'final NLL = {mle_results.fun}')\n", + "print(f'Fitted parameters = {MultiDir_FC.get_parameters()}')" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "cbde0a7e", + "metadata": { + "scrolled": true + }, + "outputs": [], + "source": [ + "#visual comparison of neural network and FC distributions\n", + "Dir_FC_eval = EvalStats(*Dir_FC.get_sample_arrays(50000))\n", + "MultiDir_FC_eval = EvalStats(*MultiDir_FC.get_sample_arrays(50000))\n", + "NNet_eval = EvalStats(gt_labels, confidences)\n", + "\n", + "fig, axs = plt.subplots(3,3, figsize=(25,25))\n", + "plt.suptitle('Dirichlet (left) vs Multi-Dirichlet (middle) vs Real NN (right)', fontsize=25)\n", + "for i, class_ in enumerate([\"top_class\", 1, 2]):\n", + " plt.sca(axs[i,0])\n", + " Dir_FC_eval.plot_confidence_distributions([class_], new_fig=False)\n", + " plt.sca(axs[i,1])\n", + " MultiDir_FC_eval.plot_confidence_distributions([class_], new_fig=False)\n", + " plt.sca(axs[i,2])\n", + " NNet_eval.plot_confidence_distributions([class_], new_fig=False)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "3b8dd436", + "metadata": {}, + "outputs": [], + "source": [ + "save_fc_eval.append(Dir_FC_eval)\n", + "save_fc_eval.append(MultiDir_FC_eval)" + ] + }, + { + "cell_type": "markdown", + "id": "9ad12c50", + "metadata": {}, + "source": [ + "Above we have plotted the top class confidence distribution as well as the marginal confidence distributions for the actual neural network and our fitted fake classifiers (the full 10D confidence vector distribution is unfortunately a bit difficult to visualise). This allows us to visually inspect how well our fake classifiers really reflect the true confidence distribution of our classifier. The left, middle and right columns correspond to the confidence distributions of the Dirichlet fake classifier, the Multi-Dirichlet fake classifier and the real neural network respectively.\n", + "\n", + "As expected the simple Dirichlet FC does not work well. It captures neither the multimodal nature of the true marginal distribution nor the high frequency of p=1.0 confidences in the top class confidence distributions.\n", + "\n", + "The Multi-Dirichlet FC seems to do a bit better, capturing both of these qualities. However the maxima of the distributions don't lie far enough at the extremes, which is likely due to the fact that the local maximum of a Dirichlet only lies exactly in the corner in the limit of $\\alpha_k \\rightarrow \\infty$.\n", + "\n", + "Also of note is that the Multi-Dirichlet FC always tries to fit $\\sigma < 1$, always reaching the lower bound of $\\sigma = 1$ that we have imposed." + ] + }, + { + "cell_type": "markdown", + "id": "73c9cb54", + "metadata": {}, + "source": [ + "## 'Sufficiently Confident/Class Split' MLE Fitting \n", + "\n", + "Given the slight difficulties with fitting a 30-parameter Multi-Dirichlet model we try to simplify the fitting a bit. This can be done by assuming a scenario very similar to that of sufficiently confident fake classifiers. We assume that if we draw a confidence vector from the k'th Dirichlet distribution, then that confidence vector always predicts the class k. That means:\n", + "\n", + "\\begin{equation}\n", + "P(\\text{argmax}(\\vec{c}) = k | \\vec{c} \\sim \\text{Dir}_k) \\simeq 1 \\text{ or equivalently } P(\\text{argmax}(\\vec{c}) = k) \\simeq p_k \n", + "\\end{equation}\n", + "\n", + "where $p_k$ is the weight of the k'th Dirichlet distribution. This is of course only a good approximation if $P(\\text{argmax}(\\vec{c}) \\neq k | \\vec{c} \\sim \\text{Dir}_k) \\simeq 0$, i.e. if the individual Dirichlet distributions are sufficiently concentrated in their respective corner of the simplex with only negligible probability mass near the other corners of the simplex.\n", + "\n", + "From the marginal class confidence distribution of the real neural network, we can see that this might be a very good approximation, as the class confidences always either lie near $c_k = 1.0$ or $c_k = 0.0$." + ] + }, + { + "cell_type": "markdown", + "id": "cf0d8482", + "metadata": {}, + "source": [ + "This approximation makes two simplifications possible:\n", + "\n", + "1. We no longer have to fit the distribution weights $p_k$. Instead they can simply be read off from the relative frequencies of the real neural network's predicted classes. This reduces the number of fitted parameters from 30 to 20\n", + "\n", + "2. We can now fit each Dirichlet distribution from the Multi-Dirichlet Fake Classifier individually to the subset of the data for which $\\text{argmax}(\\vec{c}) = k$. This reduces the problem further from a simultaneous fit of 20 parameters to 10 fits of 2 parameters each.\n" + ] + }, + { + "cell_type": "markdown", + "id": "4ac25abe", + "metadata": {}, + "source": [ + "We proceed in the following way: First we split the real neural network's confidence distribution by predicted class into 10 subsets. Using the relative sizes of these subsets we estimate the distribution weights of the Dirichlet distributions in our Multi-Dirichlet FC. We then fit each Dirichlet distribution of our Multi-Dirichlet FC separately to its corresponding subset, by minimizing the negative log-likelihood as before.\n", + "\n", + "This simplified fitting procedure can be called using ``.fit(..., simplified_fitting=True)``." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "36bbe734", + "metadata": {}, + "outputs": [], + "source": [ + "#Fit Multi-Dirichlet FC to the test set confidence vector distributions using above simplification\n", + "num_classes = confidences.shape[1]\n", + "\n", + "MultiDir_FC = MultiDirichletFC(num_classes)\n", + "mle_results = MultiDir_FC.fit(confidences,\n", + " initial_parameters=np.array([10, 2]),\n", + " parameter_bounds=[(1,None), (1,None)], \n", + " simplified_fitting=True)\n", + "\n", + "save_NLL.append(np.sum([k_result.fun for k_result in mle_results]))\n", + "print(f'final NLL = {np.sum([k_result.fun for k_result in mle_results])}')\n", + "print('alpha = {}\\n\\nsigma = {}\\n\\ndistribution_weights = {}'.format(*MultiDir_FC.get_parameters()))" + ] + }, + { + "cell_type": "markdown", + "id": "d8c2c84a", + "metadata": {}, + "source": [ + "Similar to above we can now visually compare the marginal and top class confidence distributions of our fake classifier and the real neural network:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "53399344", + "metadata": { + "scrolled": true + }, + "outputs": [], + "source": [ + "MultiDir_FC_eval = EvalStats(*MultiDir_FC.get_sample_arrays(50000))\n", + "NNet_eval = EvalStats(gt_labels, confidences)\n", + "\n", + "fig, axs = plt.subplots(3,2, figsize=(10,15))\n", + "plt.suptitle('Simplified Multi-Dirichlet (left) vs Real NN (right)', fontsize=15)\n", + "for i, class_ in enumerate([\"top_class\", 1, 2]):\n", + " plt.sca(axs[i,0])\n", + " MultiDir_FC_eval.plot_confidence_distributions([class_], new_fig=False)\n", + " plt.sca(axs[i,1])\n", + " NNet_eval.plot_confidence_distributions([class_], new_fig=False)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "261f2007", + "metadata": {}, + "outputs": [], + "source": [ + "save_fc_eval.append(MultiDir_FC_eval)" + ] + }, + { + "cell_type": "markdown", + "id": "f6fd8860", + "metadata": {}, + "source": [ + "This doesn't really seem to work much better than fitting all 30 parameters at the same time. It actually looks qualitatively worse. This is likely because the approximation made to 'derive' this fitting procedure is simply not very good in the case of LeNet5. However, with ResNet20 and ResNet110 this approximation becomes much better and the 'class split' fitting produces almost exactly the same results.\n", + "\n", + "Most importantly though, it is much less computationally expensive."
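+ As a quick sanity check of how good this approximation is for a single fitted component, one can estimate $P(\text{argmax}(\vec{c}) = k | \vec{c} \sim \text{Dir}_k)$ by Monte Carlo (a sketch, assuming only numpy; the function name is hypothetical):
+
+ ```python
+ import numpy as np
+
+ def prob_argmax_is_k(alpha_k, sigma_k, k, num_classes, n_samples=100000, seed=0):
+     # estimate P(argmax(c) = k | c ~ Dir_k) by sampling the k-th mixture component
+     rng = np.random.default_rng(seed)
+     concentration = np.ones(num_classes)
+     concentration[k] = alpha_k
+     samples = rng.dirichlet(sigma_k * concentration, size=n_samples)
+     return np.mean(samples.argmax(axis=1) == k)
+
+ # should be close to 1 for a component that is sufficiently concentrated
+ # in its corner, e.g. prob_argmax_is_k(alpha_k=10, sigma_k=2, k=0, num_classes=10)
+ ```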
+ ] + }, + { + "cell_type": "markdown", + "id": "9654ff88", + "metadata": {}, + "source": [ + "## Sigma < 1\n", + "\n", + "Interestingly enough the simplified fitting runs into the same problem of always trying to fit very small $\\sigma_k < 1$, even though the sensible/intuitive choice would be $\\sigma_k > 1$. This indicates that maybe our sensible/intuitive choice of wanting $\\sigma_k > 1$ isn't actually that sensible. As explained earlier the reasoning behind the Multi-Dirichlet model is that each Dirichlet distribution is used to create a local maximum in one of the corners of the simplex, with $\\alpha_k$ determining how squished into the corner and $\\sigma_k$ how broad the local maximum is. The requirement of $\\sigma_k > 1$ and $\\alpha_k > 1$ arose because if any parameter of the Dirichlet is $< 1$ a local maximum does not exist and our whole reasoning behind introducing the Multi-Dirichlet fails." + ] + }, + { + "cell_type": "markdown", + "id": "67c5050c", + "metadata": {}, + "source": [ + "However, as it turns out a reasonable parameter choice might also be $\\sigma_k < 1$ with $\\alpha_k \\cdot \\sigma_k > 1$. This would mean that all parameters of the Dirichlet, with the exception of the k'th entry, are $ < 1$. While a local maximum no longer exists (we now have divergences at the sides and corners) this can still lead to an equivalent situation with most of the probability mass concentrated in one corner of the simplex. Below is a quick and dirty visualisation of this:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "7c360ef1", + "metadata": {}, + "outputs": [], + "source": [ + "fig, ax = plt.subplots(1, 2, figsize=(12,5))\n", + "\n", + "vis_dir_FC = DirichletFC(3)\n", + "\n", + "vis_dir_FC.set_alpha([0.2, 20, 0.2])\n", + "_, vis_confidences = vis_dir_FC.get_sample_arrays(10000)\n", + "\n", + "ax[0].scatter(vis_confidences[:,0], vis_confidences[:,1], s=0.1)\n", + "ax[0].plot([0,1], [1,0], 'r--')\n", + "ax[0].plot([0,1], [0,0], 'r--')\n", + "ax[0].plot([0,0], [0,1], 'r--')\n", + "ax[0].set_xlim([-0.2,1.2])\n", + "ax[0].set_ylim([-0.2,1.2])\n", + "\n", + "vis_dir_FC.set_alpha([2, 50, 2])\n", + "_, vis_confidences = vis_dir_FC.get_sample_arrays(10000)\n", + "\n", + "ax[1].scatter(vis_confidences[:,0], vis_confidences[:,1], s=0.1)\n", + "ax[1].plot([0,1], [1,0], 'r--')\n", + "ax[1].plot([0,1], [0,0], 'r--')\n", + "ax[1].plot([0,0], [0,1], 'r--')\n", + "ax[1].set_xlim([-0.2,1.2])\n", + "ax[1].set_ylim([-0.2,1.2])" + ] + }, + { + "cell_type": "markdown", + "id": "176821eb", + "metadata": {}, + "source": [ + "Below we have repeated both the direct Multi-Dirichlet FC fitting as well as the 'class split' fitting, but this time with a lower bound for sigma of $\\sigma_\\text{lower} = 0.001$. Allowing for small $\\sigma$ seems to improve things quite a bit."
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "554757d3", + "metadata": {}, + "outputs": [], + "source": [ + "#Fit Multi-Dirichlet FC allowing sigma < 1\n", + "num_classes = confidences.shape[1]\n", + "\n", + "MultiDir_FC = MultiDirichletFC(num_classes)\n", + "mle_results = MultiDir_FC.fit(confidences,\n", + " initial_parameters=np.array([10, 2, 1/num_classes]),\n", + " parameter_bounds=[(1,None), (0.001,None), (0,1)], \n", + " simplified_fitting=False)\n", + "\n", + "save_NLL.append(mle_results.fun)\n", + "print(f'final NLL = {mle_results.fun}')\n", + "print('alpha = {}\\n\\nsigma = {}\\n\\ndistribution_weights = {}'.format(*MultiDir_FC.get_parameters()))" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "7e0e847a", + "metadata": { + "scrolled": true + }, + "outputs": [], + "source": [ + "MultiDir_FC_eval = EvalStats(*MultiDir_FC.get_sample_arrays(50000))\n", + "NNet_eval = EvalStats(gt_labels, confidences)\n", + "\n", + "fig, axs = plt.subplots(3,2, figsize=(10,15))\n", + "plt.suptitle('Multi-Dirichlet w/ sigma < 1 (left) vs Real NN (right)', fontsize=15)\n", + "for i, class_ in enumerate([\"top_class\", 1, 2]):\n", + " plt.sca(axs[i,0])\n", + " MultiDir_FC_eval.plot_confidence_distributions([class_], new_fig=False)\n", + " plt.sca(axs[i,1])\n", + " NNet_eval.plot_confidence_distributions([class_], new_fig=False)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "57486d89", + "metadata": {}, + "outputs": [], + "source": [ + "save_fc_eval.append(MultiDir_FC_eval)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "44cde78e", + "metadata": {}, + "outputs": [], + "source": [ + "#Fit Multi-Dirichlet FC allowing sigma < 1 with simplified fitting\n", + "num_classes = confidences.shape[1]\n", + "\n", + "MultiDir_FC = MultiDirichletFC(num_classes)\n", + "mle_results = MultiDir_FC.fit(confidences,\n", + " initial_parameters=np.array([10, 2]),\n", + " parameter_bounds=[(1,None), (0.001,None)], \n", + " simplified_fitting=True)\n", + "\n", + "save_NLL.append(np.sum([k_result.fun for k_result in mle_results]))\n", + "print(f'final NLL = {np.sum([k_result.fun for k_result in mle_results])}')\n", + "print('alpha = {}\\n\\nsigma = {}\\n\\ndistribution_weights = {}'.format(*MultiDir_FC.get_parameters()))" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "2e783d4b", + "metadata": { + "scrolled": true + }, + "outputs": [], + "source": [ + "MultiDir_FC_eval = EvalStats(*MultiDir_FC.get_sample_arrays(50000))\n", + "NNet_eval = EvalStats(gt_labels, confidences)\n", + "\n", + "fig, axs = plt.subplots(3,2, figsize=(10,15))\n", + "plt.suptitle('Simplified Multi-Dirichlet w/ sigma < 1 (left) vs Real NN (right)', fontsize=15)\n", + "for i, class_ in enumerate([\"top_class\", 1, 2]):\n", + " plt.sca(axs[i,0])\n", + " MultiDir_FC_eval.plot_confidence_distributions([class_], new_fig=False)\n", + " plt.sca(axs[i,1])\n", + " NNet_eval.plot_confidence_distributions([class_], new_fig=False)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "50b49390", + "metadata": {}, + "outputs": [], + "source": [ + "save_fc_eval.append(MultiDir_FC_eval)" + ] + }, + { + "cell_type": "markdown", + "id": "f2a6a90a", + "metadata": {}, + "source": [ + "## Conclusion:\n", + "\n", + "The confidence distributions have been aggregated here again for easier comparison." 
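+ For a compact numeric summary, the final NLLs collected above can simply be printed side by side (the list of names mirrors the order in which the fits above appended to ``save_NLL``):
+
+ ```python
+ fit_names = ['Dirichlet', 'Multi-Dirichlet (direct)', 'Multi-Dirichlet (class split)',
+              'Multi-Dirichlet (direct, sigma < 1)', 'Multi-Dirichlet (class split, sigma < 1)']
+ for name, nll in zip(fit_names, save_NLL):
+     print(f'{name}: final NLL = {nll}')
+ ```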
+ ] + }, + { + "cell_type": "markdown", + "id": "a2674d5a", + "metadata": {}, + "source": [ + "### Real NN's Confidence Distribution:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "89c824e0", + "metadata": { + "scrolled": false + }, + "outputs": [], + "source": [ + "fig, axs = plt.subplots(1,2, figsize=(12,5))\n", + "plt.axes(axs[0])\n", + "NNet_eval.plot_confidence_distributions([\"top_class\"], new_fig = False)\n", + "plt.axes(axs[1])\n", + "NNet_eval.plot_confidence_distributions([0], new_fig = False)\n" + ] + }, + { + "cell_type": "markdown", + "id": "55c88e7c", + "metadata": {}, + "source": [ + "### Dirichlet FC Confidence Distribution:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "c9fec3e5", + "metadata": {}, + "outputs": [], + "source": [ + "print(f'final NLL = {save_NLL[0]}')\n", + "fig, axs = plt.subplots(1,2, figsize=(12,5))\n", + "plt.axes(axs[0])\n", + "save_fc_eval[0].plot_confidence_distributions([\"top_class\"], new_fig = False)\n", + "plt.axes(axs[1])\n", + "save_fc_eval[0].plot_confidence_distributions([0], new_fig = False)" + ] + }, + { + "cell_type": "markdown", + "id": "33b2c2ec", + "metadata": {}, + "source": [ + "### Directly Fitted Multi Dirichlet FC Confidence Distribution:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "d1b58dd1", + "metadata": {}, + "outputs": [], + "source": [ + "print(f'final NLL = {save_NLL[1]}')\n", + "fig, axs = plt.subplots(1,2, figsize=(12,5))\n", + "plt.axes(axs[0])\n", + "save_fc_eval[1].plot_confidence_distributions([\"top_class\"], new_fig = False)\n", + "plt.axes(axs[1])\n", + "save_fc_eval[1].plot_confidence_distributions([0], new_fig = False)" + ] + }, + { + "cell_type": "markdown", + "id": "5e2776ab", + "metadata": {}, + "source": [ + "### Class Split Fitted Multi Dirichlet FC Confidence Distribution:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "4690f7f9", + "metadata": {}, + "outputs": [], + "source": [ + "print(f'final NLL = {save_NLL[2]}')\n", + "fig, axs = plt.subplots(1,2, figsize=(12,5))\n", + "plt.axes(axs[0])\n", + "save_fc_eval[2].plot_confidence_distributions([\"top_class\"], new_fig = False)\n", + "plt.axes(axs[1])\n", + "save_fc_eval[2].plot_confidence_distributions([0], new_fig = False)" + ] + }, + { + "cell_type": "markdown", + "id": "601d6c0f", + "metadata": {}, + "source": [ + "### Directly Fitted Multi Dirichlet FC Confidence Distribution, sigma < 1:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "91fa9fab", + "metadata": {}, + "outputs": [], + "source": [ + "print(f'final NLL = {save_NLL[3]}')\n", + "fig, axs = plt.subplots(1,2, figsize=(12,5))\n", + "plt.axes(axs[0])\n", + "save_fc_eval[3].plot_confidence_distributions([\"top_class\"], new_fig = False)\n", + "plt.axes(axs[1])\n", + "save_fc_eval[3].plot_confidence_distributions([0], new_fig = False)" + ] + }, + { + "cell_type": "markdown", + "id": "728487c5", + "metadata": {}, + "source": [ + "### Class Split Fitted Multi Dirichlet FC Confidence Distribution, sigma < 1:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "2836fd90", + "metadata": {}, + "outputs": [], + "source": [ + "print(f'final NLL = {save_NLL[4]}')\n", + "fig, axs = plt.subplots(1,2, figsize=(12,5))\n", + "plt.axes(axs[0])\n", + "save_fc_eval[4].plot_confidence_distributions([\"top_class\"], new_fig = False)\n", + "plt.axes(axs[1])\n", + "save_fc_eval[4].plot_confidence_distributions([0], new_fig = False)" + ] + }, + { + "cell_type": "markdown", 
+ "id": "8a665cc1", + "metadata": {}, + "source": [ + "\n", + "Comparing how well different models fit is always a bit of a tricky thing to do. The Likelihood ratio test only works for comparing nested models, i.e. models where one is a special/restricted case of the other model, which is unfortunately not the case here.\n", + "\n", + "However, non-nested models can be compared (informally) using the Akaike Information Criterion (AIC), defined as:\n", + "\n", + "\\begin{equation}\n", + "AIC = 2k - 2\\ln(\\hat{L})\n", + "\\end{equation}\n", + "\n", + "where $k$ is the number of parameters of the model and $\\hat{L}$ the maximum lilelihood.\n", + "\n", + "Given that our models have parameters of either on the order of $\\sim 10^1$ whereas $\\ln(\\hat{L}) \\sim 10^6$ we can pretty much just compare likelihoods directly.\n", + "\n", + "### For the LeNet 5 we conclude:\n", + "\n", + "1. Suprisingly, the simple Dirichlet fits the real NN better than the Multi-Dirichlet with $\\sigma > 1$, even though visually it looks like the Multi-Dirichlet captures/reproduces the marginal distributions of the real NN a lot better\n", + "\n", + "2. Also suprisingly, fitting the Multi-Dirichlet directly gives worse results according to the AIC comared to fitting it via the approximation of 'class splitting' (even though, again, the directly fitted Multi-Dirichlet marginal distributions looks more 'real')\n", + "\n", + "3. Allowing for $\\sigma < 1$ improves the AIC a lot for both directly and class split fitted Multi-Dirichlet FC's\n", + "\n", + "The 'most real' results seem to be achieved using the directly fitted Multi-Dirichlet FC and allowing sigma < 1. While it does actually result in a slightly worse AIC than the same class-split FC the marginal distributions look a lot more 'real'.\n", + "\n", + "### For the ResNet 20 we conclude:\n", + "\n", + "Similar results as for the LeNet 5 with one major exception:\n", + "\n", + "The difference between the class split and directly fitted multi dirichlet FC is negligibly small. This not a suprise as we expect the more complex NN's to be more confident in general, i.e. for the confidences to be more concentrated in the corners. This makes the more complex NN's closer to a 'sufficiently confident like' behaviour that we based the class split fitting approximation on.\n", + "\n", + "Again the most promising results seem to be achieved using the directly fitted Multi-Dirichlet FC and allowing sigma < 1.\n", + "\n", + "### For the ResNet 110 we conclude:\n", + "\n", + "Pretty much almost the same as for the ResNet 20\n", + "\n", + "### In General:\n", + "\n", + "The Multi-Dirichlet FC seems to be a lot better at reproducing the ResNet 20 and ResNet 110 confidence distributions. The LeNet 5 is more difficult for the Multi-Dirichlet FC to reproduce. This is likely due to the LeNet 5 not being 'overconfident enough', i.e. when looking at the LeNet's 'top class' confidence distribution a fair amount of confidences lie in the middle of the distribution at $p \\sim 0.5$. For ResNet 20 and ResNet 110 this is not the case, pretty much all confidences lie at the extreme value of $p \\sim 1.0$." + ] + }, + { + "cell_type": "markdown", + "id": "207568b4", + "metadata": {}, + "source": [ + "Maybe TODO: Try other fitting methods: e.g. moment matching or fitting only to the marginal distributions\n", + "\n", + "Maybe TODO: Try 'stochastic fitting' for the multi-gaussian FC where we sample the FC calculate the corrseponding marginal distributions and minimize e.g. 
the squared error loss between the neural network marginal distributions and the marginal distributions sampled from the FC. Could get around problem of multi-gaussian FC having analytically intractable pdf\n", + "\n", + "TODO: Find a good simplex automorphism to also recreate the reliability diagrams of the neural networks." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "886ed246", + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "14560a0f", + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "b9bbf6b5", + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "markdown", + "id": "848c1ade", + "metadata": {}, + "source": [ + "## This didn't really work but it's here for completeness" + ] + }, + { + "cell_type": "markdown", + "id": "7076089c", + "metadata": {}, + "source": [ + "## Further Modifications to Fitting Procedure\n", + "\n", + "There are definitely more ways to improve the fitting procedure (see https://epub.wu.ac.at/4077/1/Report125.pdf).\n", + "\n", + "A relatively simple one is to transform the parameters before fitting. In general, introducing bounds into minimzation problems can complicate things. However instead of fitting the dirichlet parameters directly we can first rescale/transform them onto the interval $[-\\infty, \\infty]$ (Very similar to the idea of generalized linear models, just that we do not have predictor variables in our case).\n", + "\n", + "For the simple dirichlet FC and appropiate rescaling would be $log(\\cdot)$ mapping the allowed alpha interval $[0, \\infty]$ onto $[-\\infty, \\infty]$\n", + "\n", + "For the multi-dirichlet FC, with $\\sigma > 0$ and $\\alpha > 1$ an appropiate transform would be e.g. $log(\\sigma)$ and $log(\\alpha - 1)$" + ] + }, + { + "cell_type": "markdown", + "id": "0416f7fe", + "metadata": {}, + "source": [ + "Below we fitted the Multi-Dirichlet FC again, using both the direct and 'clas split' fitting procedure, however this time with the above transformstions applied to the parameters." + ] + }, + { + "cell_type": "markdown", + "id": "b629f308", + "metadata": {}, + "source": [ + "**Note**:\n", + "\n", + "As it turns out rescaling the parameters might not be that great of an idea. The problem is that the minimization algorithms end up taking stepsizes of up to $10$ in $\\log(\\alpha)$ which results in $\\alpha \\sim 10^{10}$. Even after rescaling the confidences to avoid 0 values, we still run into problems with such a large alpha. For a value of $c \\sim 10^{-5}$, we get $c^\\alpha \\sim 10^{-50}$ which might as well be 0 and will lead to errors in the log-likelihood. 
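+ For reference, the analogous transformed fit for the simple Dirichlet FC takes only a few lines. This is a sketch, assuming the ``DirichletFC.pdf`` and ``set_alpha`` interface used elsewhere in this notebook; the confidence rescaling mirrors the cell below:
+
+ ```python
+ import numpy as np
+ import scipy.optimize
+
+ num_classes = confidences.shape[1]
+ rescaled = (confidences * (confidences.shape[0] - 1) + 1 / num_classes) / confidences.shape[0]
+ rescaled = rescaled / np.sum(rescaled, axis=1)[:, None]
+
+ Dir_FC = DirichletFC(num_classes)
+
+ def dir_NLL(log_alpha):
+     # alpha = exp(log_alpha) maps the unconstrained parameters back to alpha > 0
+     return -np.sum(np.log(Dir_FC.pdf(rescaled, np.exp(log_alpha))))
+
+ dir_bestfit = scipy.optimize.minimize(dir_NLL, np.zeros(num_classes))
+ Dir_FC.set_alpha(np.exp(dir_bestfit.x))
+ print(f'final NLL = {dir_bestfit.fun}')
+ ```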
" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "fde5e3f7", + "metadata": {}, + "outputs": [], + "source": [ + "#rescale the confidences to avoid 0 confidences and renormalize\n", + "\n", + "num_classes = confidences.shape[1]\n", + "\n", + "rescaled_confidences = (confidences * (confidences.shape[0] - 1) + 1/num_classes) / confidences.shape[0]\n", + "rescaled_confidences = rescaled_confidences / np.sum(rescaled_confidences, axis=1)[:,None]" + ] + }, + { + "cell_type": "markdown", + "id": "2afdb89c", + "metadata": {}, + "source": [ + "### First the simultaneous fit of 30 parameters:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "0cfba92c", + "metadata": {}, + "outputs": [], + "source": [ + "MultiDir_FC = MultiDirichletFC(num_classes)\n", + "\n", + "def MultiDir_NLL(log_alpha, log_sigma, logit_distribution_weights):\n", + " alpha = np.exp(log_alpha)+1\n", + " sigma = np.exp(log_sigma)\n", + " distribution_weights = np.exp(logit_distribution_weights) / (1 + np.exp(logit_distribution_weights))\n", + " return -np.sum(np.log( MultiDir_FC.pdf(rescaled_confidences, alpha, sigma, distribution_weights) ))\n", + " \n", + "#initial guesses for fitting multi-dirichlet (no longer need bounds due to log transformations)\n", + "init_log_alpha = np.zeros(num_classes)\n", + "init_log_sigma = np.zeros(num_classes)\n", + "init_logit_distribution_weights = np.zeros(num_classes)\n", + "\n", + "MultiDirichlet_bestfit = scipy.optimize.minimize(lambda parms: MultiDir_NLL(*np.split(parms,3)),\n", + " np.concatenate((init_log_alpha, init_log_sigma, init_logit_distribution_weights)),\n", + " options={'disp': True})\n", + "print(f'final NLL = {MultiDirichlet_bestfit.fun}')" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "ce29935a", + "metadata": {}, + "outputs": [], + "source": [ + "#Set FC parameters to those found from MLE fit\n", + "l_a, l_s, l_w = np.split(MultiDirichlet_bestfit.x,3)\n", + "MultiDir_FC.set_parameters(np.exp(l_a)+1, np.exp(l_s), np.exp(l_w) / (1 + np.exp(l_w)))" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "46ef2bc2", + "metadata": {}, + "outputs": [], + "source": [ + "print('alpha_k = {}\\n\\nsigma_k = {}\\n\\ndistribution_weights = {}'.format(*np.split(MultiDirichlet_bestfit.x,3)))" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "6810412e", + "metadata": {}, + "outputs": [], + "source": [ + "MultiDir_FC_eval = EvalStats(*MultiDir_FC.get_sample_arrays(50000))\n", + "NNet_eval = EvalStats(gt_labels, confidences)\n", + "\n", + "MultiDir_FC_eval.plot_confidence_distributions([0,\"top_class\"])\n", + "NNet_eval.plot_confidence_distributions([0,\"top_class\"])\n", + "\n", + "for i in range(3):\n", + " MultiDir_FC_eval.plot_confidence_distributions([i])\n", + " NNet_eval.plot_confidence_distributions([i])" + ] + }, + { + "cell_type": "markdown", + "id": "90c4a3e1", + "metadata": {}, + "source": [ + "### And secondly the 10 fits of 2 parameters each:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "2d405235", + "metadata": {}, + "outputs": [], + "source": [ + "class_split_confidences = [rescaled_confidences[np.argmax(rescaled_confidences, axis=1)==i, :] for i in range(num_classes)]\n", + "\n", + "estimated_distribution_weights = [k_class_confidences.shape[0] for k_class_confidences in class_split_confidences]\n", + "estimated_distribution_weights = estimated_distribution_weights / np.sum(estimated_distribution_weights)" + ] + }, + { + "cell_type": "code", + 
"execution_count": null, + "id": "cad35460", + "metadata": {}, + "outputs": [], + "source": [ + "Dir_FC = DirichletFC(num_classes)\n", + "MultiDir_FC = MultiDirichletFC(num_classes)\n", + "\n", + "alpha_k = []\n", + "sigma_k = []\n", + "NLL_k = []\n", + "\n", + "for k, k_class_confidences in enumerate(class_split_confidences):\n", + " \n", + " def k_dir_NLL(log_alpha, log_sigma):\n", + " alpha_vector = np.ones(num_classes)\n", + " alpha_vector[k] = np.exp(log_alpha)+1 \n", + " alpha_vector *= np.exp(log_sigma)\n", + " return -np.sum(np.log( Dir_FC.pdf(k_class_confidences, alpha_vector) ))\n", + "\n", + " #initial guesses fitting individual dirichlets (no longer need bounds due to log transformations)\n", + " init_log_alpha = 0\n", + " init_log_sigma = 0\n", + "\n", + " k_dir_bestfit = scipy.optimize.minimize(lambda parms: k_dir_NLL(*parms), np.array([init_log_alpha, init_log_sigma]),\n", + " options={'disp': True})\n", + " \n", + " alpha_k.append(np.exp(k_dir_bestfit.x[0])+1)\n", + " sigma_k.append(np.exp(k_dir_bestfit.x[1]))\n", + " NLL_k.append(k_dir_bestfit.fun)\n", + "\n", + "print(f'final NLL = {np.sum(NLL_k)}')" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "b8bbee01", + "metadata": {}, + "outputs": [], + "source": [ + "MultiDir_FC.set_parameters(alpha_k, sigma_k, estimated_distribution_weights)\n", + "print('alpha_k = {}\\n\\nsigma_k = {}\\n\\ndistribution_weights = {}'.format(*MultiDir_FC.get_parameters()))" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "9f8a984b", + "metadata": {}, + "outputs": [], + "source": [ + "MultiDir_FC_eval = EvalStats(*MultiDir_FC.get_sample_arrays(50000))\n", + "NNet_eval = EvalStats(gt_labels, confidences)\n", + "\n", + "MultiDir_FC_eval.plot_confidence_distributions([0,\"top_class\"])\n", + "NNet_eval.plot_confidence_distributions([0,\"top_class\"])\n", + "\n", + "for i in range(3):\n", + " MultiDir_FC_eval.plot_confidence_distributions([i])\n", + " NNet_eval.plot_confidence_distributions([i])\n" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.8.7" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/src/kale/__init__.py b/public/.nojekyll similarity index 100% rename from src/kale/__init__.py rename to public/.nojekyll diff --git a/public/coverage/.gitignore b/public/coverage/.gitignore new file mode 100644 index 0000000..e69de29 diff --git a/public/docs/.gitignore b/public/docs/.gitignore new file mode 100644 index 0000000..e69de29 diff --git a/public/index.html b/public/index.html new file mode 100644 index 0000000..8eafa58 --- /dev/null +++ b/public/index.html @@ -0,0 +1,15 @@ + + + + kyle project pages + + +

+    <h1>Welcome to the kyle project pages!</h1>
+    <p>
+      This page hosts the documentation and reports from the develop branch of the project
+    </p>
+ + + + \ No newline at end of file diff --git a/pyproject.toml b/pyproject.toml index 08174f4..0fd08c9 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -5,3 +5,7 @@ requires = [ "setuptools_scm >= 2.0.0, <3" ] build-backend = "setuptools.build_meta" +# Black-compatible settings for isort +# See https://black.readthedocs.io/en/stable/compatible_configs.html +[tool.isort] +profile = "black" diff --git a/pytest.ini b/pytest.ini new file mode 100644 index 0000000..e618d7a --- /dev/null +++ b/pytest.ini @@ -0,0 +1,3 @@ +[pytest] +testpaths = + tests diff --git a/requirements-dev.txt b/requirements-dev.txt new file mode 100644 index 0000000..f016374 --- /dev/null +++ b/requirements-dev.txt @@ -0,0 +1,8 @@ +tox +jupyter +pytest +pylint +bump2version +anybadge +pandas +tqdm diff --git a/requirements-torch.txt b/requirements-torch.txt new file mode 100644 index 0000000..966d63a --- /dev/null +++ b/requirements-torch.txt @@ -0,0 +1,4 @@ +torch==1.6.0 +torchvision==0.7.0 +kornia~=0.5 +pytorch-lightning==1.2.8 diff --git a/requirements.txt b/requirements.txt index 405584b..9f0cfe4 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,5 +1,5 @@ -numpy==1.18.4 -torch==1.5.0 -pyro-ppl==1.3.1 -pandas==1.0.3 -scikit-learn==0.23.1 \ No newline at end of file +numpy~=1.18.4 +scikit-learn~=0.23.1 +matplotlib~=3.2.1 +scipy~=1.4 +netcal~=1.0 diff --git a/scripts/run_sample.py b/scripts/run_sample.py deleted file mode 100644 index a9dd6b1..0000000 --- a/scripts/run_sample.py +++ /dev/null @@ -1,12 +0,0 @@ -from utils import prepare_imports -prepare_imports() - -from kale.sample_module import SampleClass -from config import get_config - -if __name__ == "__main__": - - c = get_config() - assert c.sample_key == "sample_value" - print(SampleClass().sample_method("Miguel and Mischa")) - print("Your library project kale is done waiting for you!") \ No newline at end of file diff --git a/scripts/utils.py b/scripts/utils.py deleted file mode 100644 index 65b917a..0000000 --- a/scripts/utils.py +++ /dev/null @@ -1,18 +0,0 @@ -import logging -import os -import sys - -log = logging.getLogger(__name__) - - -def prepare_imports(): - sys.path.insert(0, os.path.abspath(os.path.join(__file__, "../.."))) # insert top level modules - # use either the installed library or the modules in src (this way scripts can be executed in both scenarios) - try: - import kale - except ModuleNotFoundError: - src_path = os.path.abspath(os.path.join(__file__, "../../src")) - log.warning(f"kale library was not installed, " - f"will try to import modules from {src_path}") - sys.path.insert(0, src_path) - import kale diff --git a/setup.py b/setup.py index 092e3b1..af30159 100644 --- a/setup.py +++ b/setup.py @@ -1,21 +1,25 @@ from setuptools import find_packages, setup -test_requirements = ['pytest'] -docs_requirements = ['Sphinx==2.4.2', 'sphinxcontrib-websupport==1.2.0', 'sphinx_rtd_theme'] +test_requirements = ["pytest"] +docs_requirements = [ + "Sphinx==3.2.1", + "sphinxcontrib-websupport==1.2.4", + "sphinx_rtd_theme", +] setup( - name='kale', + name="kyle", package_dir={"": "src"}, packages=find_packages(where="src"), + python_requires=">=3.8", + license="MIT", + url="https://github.com/appliedAI-Initiative/kyle", include_package_data=True, - version='0.1.0', - description='Library for kale', + version="0.1.0", + description="appliedAI classifier calibration library", install_requires=open("requirements.txt").readlines(), setup_requires=["wheel"], tests_require=test_requirements, - extras_require={ - "test": test_requirements, - 
"docs": docs_requirements - }, - author='Miguel and Mischa' + extras_require={"test": test_requirements, "docs": docs_requirements}, + author="appliedAI", ) diff --git a/src/kale/sampling.py b/src/kale/sampling.py deleted file mode 100644 index 779dafe..0000000 --- a/src/kale/sampling.py +++ /dev/null @@ -1,109 +0,0 @@ -from abc import ABC, abstractmethod -from typing import Sequence - -import numpy as np -import pyro -import pyro.distributions as dist -import torch -from torch import tensor - - -class SimplexAutomorphism(ABC): - """ - Base class for all simplex automorphisms - - :param num_classes: The dimension of the simplex vector. This equals 1 + (dimension of the simplex as manifold) - """ - def __init__(self, num_classes: int): - self.num_classes = num_classes - - def _in_simplex(self, x: np.ndarray): - return len(x) == self.num_classes and np.isclose(sum(x), 1) and all(x >= 0) and all(x <= 1) - - @abstractmethod - def _transform(self, x: np.ndarray) -> np.ndarray: - pass - - def transform(self, x: np.ndarray): - if not self._in_simplex(x): - raise ValueError(f"Input has to be from a {self.num_classes - 1} dimensional simplex") - result = self._transform(x) - if not self._in_simplex(result): - raise Exception(f"Bad implementation: Output has to be from a {self.num_classes - 1} dimensional simplex") - return result - - -class IdentitySimplexAutomorphism(SimplexAutomorphism): - def _transform(self, x: np.ndarray) -> np.ndarray: - return x - - -class ScalingSimplexAutomorphism(SimplexAutomorphism): - """ - An automorphism that scales each axis/class with the corresponding parameter and normalizes the result such - tha it sums 1. If all scaling parameters are equal, this corresponds to the identity operation. - - :param num_classes: - :param scaling_parameters: array with positive numbers, one per class - """ - def __init__(self, num_classes: int, scaling_parameters: np.ndarray): - assert (l_def := len(scaling_parameters)) == num_classes, \ - f"scaling parameters has wrong number of classes {l_def}" - self.scaling_parameters = scaling_parameters - super().__init__(num_classes) - - def _transform(self, x: np.ndarray) -> np.ndarray: - x = np.multiply(x, self.scaling_parameters) - return x/x.sum() - - -class FakeClassifier: - """ - A fake classifier for sampling ground truth and class probabilities vectors, - see https://gitlab.aai.lab/tl/calibration/texts for more details. - By default instantiated with uniform distributions and trivial simplex automorphisms, these can be adjusted - after instantiation. 
- - :param num_classes: Number of ground truth classes, must be larger than 1 - """ - def __init__(self, num_classes: int): - assert num_classes > 1, f"{self.__class__.__name__} requires at least two classes" - self.num_classes = num_classes - self.predicted_class_categorical = dist.Categorical(torch.ones(self.num_classes)) - self.dirichlet_dists = [dist.Dirichlet(torch.ones(self.num_classes) / self.num_classes)] * self.num_classes - self.simplex_automorphisms = [IdentitySimplexAutomorphism(self.num_classes)] * self.num_classes - - def _unit_vector(self, i: int): - e_i = np.zeros(self.num_classes) - e_i[i] = 1 - return e_i - - def with_predicted_class_categorical(self, weights: Sequence[float]): - assert len(weights) == self.num_classes, \ - f"Expected {self.num_classes} probabilities of categorical distribution" - self.predicted_class_categorical = dist.Categorical(tensor(weights)) - return self - - def with_simplex_automorphisms(self, simplex_automorphisms: Sequence[SimplexAutomorphism]): - assert len(simplex_automorphisms) == self.num_classes, f"Expected {self.num_classes} simplex automorphisms" - for i, aut in enumerate(simplex_automorphisms): - if aut.num_classes != self.num_classes: - raise ValueError(f"simplex automorphism {i} has wrong number of classes: {aut.num_classes}") - self.simplex_automorphisms = simplex_automorphisms - return self - - def with_dirichlet_distributions(self, dirichlet_dists: Sequence[dist.Dirichlet]): - assert len(dirichlet_dists) == self.num_classes, f"Expected {self.num_classes} dirichlet_distributions" - for i, dirichlet_dist in enumerate(dirichlet_dists): - if not dirichlet_dist.shape()[0] == self.num_classes: - raise ValueError(f"dirichlet distribution {i} has wrong shape: {dirichlet_dist.shape()}") - self.dirichlet_dists = dirichlet_dists - return self - - def get_sample(self): - predicted_class = pyro.sample("predicted_class", self.predicted_class_categorical).item() - k = pyro.sample("k", self.dirichlet_dists[predicted_class]).numpy() - probabilities_vector = 1/2 * (k + self._unit_vector(predicted_class)) - ground_truth_label = self.simplex_automorphisms[predicted_class].transform(k).argmax() - return ground_truth_label, probabilities_vector - diff --git a/src/kyle/__init__.py b/src/kyle/__init__.py new file mode 100644 index 0000000..3dc1f76 --- /dev/null +++ b/src/kyle/__init__.py @@ -0,0 +1 @@ +__version__ = "0.1.0" diff --git a/src/kyle/calibration/__init__.py b/src/kyle/calibration/__init__.py new file mode 100644 index 0000000..9e8ce63 --- /dev/null +++ b/src/kyle/calibration/__init__.py @@ -0,0 +1 @@ +from .model_calibrator import ModelCalibrator diff --git a/src/kyle/calibration/calibration_methods/__init__.py b/src/kyle/calibration/calibration_methods/__init__.py new file mode 100644 index 0000000..efad157 --- /dev/null +++ b/src/kyle/calibration/calibration_methods/__init__.py @@ -0,0 +1 @@ +from .calibration_methods import * diff --git a/src/kyle/calibration/calibration_methods/calibration_methods.py b/src/kyle/calibration/calibration_methods/calibration_methods.py new file mode 100644 index 0000000..8bc4870 --- /dev/null +++ b/src/kyle/calibration/calibration_methods/calibration_methods.py @@ -0,0 +1,177 @@ +from abc import ABC, abstractmethod +from typing import Generic, List, Optional, TypeVar + +import netcal.binning as bn +import netcal.scaling as scl +import numpy as np +from netcal import AbstractCalibration +from sklearn.base import BaseEstimator + + +class BaseCalibrationMethod(ABC, BaseEstimator): + @abstractmethod + def 
fit(self, confidences: np.ndarray, ground_truth: np.ndarray): + pass + + @abstractmethod + def get_calibrated_confidences(self, confidences: np.ndarray): + pass + + def __str__(self): + return self.__class__.__name__ + + +def _get_confidences_from_netcal_calibrator( + confidences: np.ndarray, calibrator: AbstractCalibration +): + calibrated_confs = calibrator.transform(confidences) + + # TODO: there is a whole bunch of hacks here. I want to get rid of netcal, don't like the code there + # unfortunately, for 2-dim input netcal gives only the probabilities for the second class, + # changing the dimension of the output array + if calibrated_confs.ndim < 2: + second_class_confs = calibrated_confs + first_class_confs = 1 - second_class_confs + calibrated_confs = np.stack([first_class_confs, second_class_confs], axis=1) + + if ( + len(confidences) == 1 + ): # Netcal has a bug for single data points, this is a dirty fix + calibrated_confs = calibrated_confs[None, 0] + + if calibrated_confs.shape != confidences.shape: + raise RuntimeError( + f"Shape mismatch for input {confidences}, output {calibrated_confs}. " + f"Netcal output: {second_class_confs}" + ) + + return calibrated_confs + + +TNetcalModel = TypeVar("TNetcalModel", bound=AbstractCalibration) + + +# TODO: this is definitely not the final class structure. For now it's ok, I want to completely decouple from netcal soon +class NetcalBasedCalibration(BaseCalibrationMethod, Generic[TNetcalModel]): + def __init__(self, netcal_model: TNetcalModel): + self.netcal_model = netcal_model + + def fit(self, confidences: np.ndarray, ground_truth: np.ndarray): + self.netcal_model.fit(confidences, ground_truth) + + def get_calibrated_confidences(self, confidences: np.ndarray) -> np.ndarray: + return _get_confidences_from_netcal_calibrator(confidences, self.netcal_model) + + +class TemperatureScaling(NetcalBasedCalibration[scl.TemperatureScaling]): + def __init__(self): + super().__init__(scl.TemperatureScaling()) + + +class BetaCalibration(NetcalBasedCalibration[scl.BetaCalibration]): + def __init__(self): + super().__init__(scl.BetaCalibration()) + + +class LogisticCalibration(NetcalBasedCalibration[scl.LogisticCalibration]): + def __init__(self): + super().__init__(scl.LogisticCalibration()) + + +class IsotonicRegression(NetcalBasedCalibration[bn.IsotonicRegression]): + def __init__(self): + super().__init__(bn.IsotonicRegression()) + + +class HistogramBinning(NetcalBasedCalibration[bn.HistogramBinning]): + def __init__(self, bins=20): + super().__init__(bn.HistogramBinning(bins=bins)) + + +class ClassWiseCalibration(BaseCalibrationMethod): + def __init__(self, calibration_method_factory=TemperatureScaling): + self.calibration_method_factory = calibration_method_factory + self.n_classes: Optional[int] = None + self.calibration_methods: Optional[List[BaseCalibrationMethod]] = None + + # TODO: maybe parallelize this and predict + def fit(self, confidences: np.ndarray, labels: np.ndarray): + self.n_classes = confidences.shape[1] + self.calibration_methods = [] + for class_label in range(self.n_classes): + calibration_method = self.calibration_method_factory() + selected_confs, selected_labels = get_class_confs_labels( + class_label, confidences, labels + ) + calibration_method.fit(selected_confs, selected_labels) + self.calibration_methods.append(calibration_method) + + def get_calibrated_confidences(self, confs: np.ndarray): + result = np.zeros(confs.shape) + argmax = confs.argmax(1) + for class_label in range(self.n_classes): + scaler =
self.calibration_methods[class_label] + indices = argmax == class_label + selected_confs = confs[indices] + calibrated_confs = scaler.get_calibrated_confidences(selected_confs) + assert calibrated_confs.shape == selected_confs.shape, ( + f"Expected shape {selected_confs.shape} but got {calibrated_confs.shape}. Confs: " + f"{selected_confs}, output: {calibrated_confs}" + ) + + result[indices] = calibrated_confs + return result + + +class ConfidenceReducedCalibration(BaseCalibrationMethod, BaseEstimator): + def __init__(self, calibration_method=TemperatureScaling()): + self.calibration_method = calibration_method + + def fit(self, confidences: np.ndarray, ground_truth: np.ndarray): + reduced_confs, reduced_gt = get_binary_classification_data( + confidences, ground_truth + ) + self.calibration_method.fit(reduced_confs, reduced_gt) + + def get_calibrated_confidences(self, confidences: np.ndarray): + reduced_confs = get_reduced_confidences(confidences) + reduced_predictions = self.calibration_method.get_calibrated_confidences( + reduced_confs + ) + reduced_predictions = reduced_predictions[:, 0] # take only 0-class prediction + n_classes = confidences.shape[1] + non_predicted_class_confidences = (1 - reduced_predictions) / (n_classes - 1) + + # using broadcasting here + calibrated_confidences = ( + non_predicted_class_confidences * np.ones(confidences.shape).T + ) + calibrated_confidences = calibrated_confidences.T + + argmax_indices = np.expand_dims(confidences.argmax(axis=1), axis=1) + np.put_along_axis( + calibrated_confidences, argmax_indices, reduced_predictions[:, None], axis=1 + ) + assert np.all( + np.isclose(calibrated_confidences.sum(1), np.ones(len(confidences))) + ) + assert calibrated_confidences.shape == confidences.shape + return calibrated_confidences + + +def get_class_confs_labels(c: int, confidences: np.ndarray, labels: np.ndarray): + indices = confidences.argmax(1) == c + return confidences[indices], labels[indices] + + +def get_reduced_confidences(confidences: np.ndarray): + top_class_predictions = confidences.max(axis=1) + return np.stack([top_class_predictions, 1 - top_class_predictions], axis=1) + + +def get_binary_classification_data(confidences: np.ndarray, labels: np.ndarray): + new_confidences = get_reduced_confidences(confidences) + pred_was_correct = labels == confidences.argmax(axis=1) + # this is a hack - we predict class 0 if pred was correct, else class 1 + new_gt = (np.logical_not(pred_was_correct)).astype(int) + return new_confidences, new_gt diff --git a/src/kyle/calibration/model_calibrator.py b/src/kyle/calibration/model_calibrator.py new file mode 100644 index 0000000..368f1be --- /dev/null +++ b/src/kyle/calibration/model_calibrator.py @@ -0,0 +1,28 @@ +import numpy as np + +from kyle.models import CalibratableModel + + +class ModelCalibrator: + def __init__( + self, + X_calibrate: np.ndarray, + y_calibrate: np.ndarray, + X_fit: np.ndarray = None, + y_fit: np.ndarray = None, + ): + self.X_calibrate = X_calibrate + self.y_calibrate = y_calibrate + self.X_fit = X_fit + self.y_fit = y_fit + + def calibrate(self, calibratable_model: CalibratableModel, fit: bool = False): + if fit: + if self.X_fit is None or self.y_fit is None: + raise AttributeError("No dataset for fitting provided") + calibratable_model.fit(self.X_fit, self.y_fit) + + calibratable_model.calibrate(self.X_calibrate, self.y_calibrate) + + def __str__(self): + return self.__class__.__name__ diff --git a/src/kyle/datasets.py b/src/kyle/datasets.py new file mode 100644 index 0000000..380b7e2 --- 
/dev/null +++ b/src/kyle/datasets.py @@ -0,0 +1,57 @@ +import torch +from kornia.enhance import denormalize +from torch import Tensor, tensor +from torch.utils.data import DataLoader +from torchvision.datasets import CIFAR10 +from torchvision.transforms import transforms + +# see https://github.com/akamaster/pytorch_resnet_cifar10 +resnet_normalize_transform = transforms.Normalize( + mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225] +) + + +def resnet_denormalize_transform(data: Tensor): + is_batch = len(data.shape) == 4 + if not is_batch: + data = data[None, :] # transform only works on batches + result = denormalize( + data, + tensor(resnet_normalize_transform.mean), + tensor(resnet_normalize_transform.std), + ) + if not is_batch: + result = result[0] + return result + + +def get_cifar10_dataloader(path: str, train=False): + dataset = CIFAR10( + path, + train=train, + download=True, + transform=transforms.Compose( + [transforms.ToTensor(), resnet_normalize_transform] + ), + ) + dataloader = DataLoader(dataset, batch_size=4, shuffle=True, num_workers=1) + return dataloader + + +def get_cifar10_dataset(path: str, train=False): + dataset = CIFAR10( + path, + train=train, + download=True, + transform=transforms.Compose( + [transforms.ToTensor(), resnet_normalize_transform] + ), + ) + images = [] + targets = [] + # Quick hack, can't find a nice way of doing that. Datasets cannot be sliced and we need the transform + # Alternative is to retrieve the .data array and do the transformations and reshaping ourselves but this is brittle + for image, target in dataset: + images.append(image) + targets.append(target) + return torch.stack(images), torch.tensor(targets) diff --git a/src/kyle/evaluation/__init__.py b/src/kyle/evaluation/__init__.py new file mode 100644 index 0000000..b9c121c --- /dev/null +++ b/src/kyle/evaluation/__init__.py @@ -0,0 +1,2 @@ +from .continuous import compute_accuracy, compute_ECE, compute_expected_max +from .discrete import EvalStats diff --git a/src/kyle/evaluation/continuous.py b/src/kyle/evaluation/continuous.py new file mode 100644 index 0000000..cc9e387 --- /dev/null +++ b/src/kyle/evaluation/continuous.py @@ -0,0 +1,168 @@ +import numpy as np +from scipy.integrate import quad +from scipy.stats import dirichlet + +from kyle.integrals import ( + dirichlet_exp_value, + simplex_integral_fixed_comp, + simplex_integral_fixed_max, +) +from kyle.sampling.fake_clf import DirichletFC +from kyle.transformations import SimplexAut + + +def _prob_correct_prediction(conf: np.ndarray, simplex_aut: SimplexAut): + conf = conf.squeeze() + gt_probabilities = simplex_aut.transform(conf, check_io=False) + return gt_probabilities[np.argmax(conf)] + + +def _prob_class(conf: np.ndarray, simplex_aut: SimplexAut, selected_class: int): + conf = conf.squeeze() + gt_probabilities = simplex_aut.transform(conf, check_io=False) + return gt_probabilities[selected_class] + + +def _probability_vector(*parametrization: float): + return np.array(list(parametrization) + [1 - np.sum(parametrization)]) + + +def compute_accuracy(dirichlet_fc: DirichletFC, **kwargs): + def integrand(*parametrization): + conf = _probability_vector(*parametrization) + return _prob_correct_prediction(conf, dirichlet_fc.simplex_automorphism) + + return dirichlet_exp_value(integrand, dirichlet_fc.alpha, **kwargs) + + +def compute_expected_max(dirichlet_fc: DirichletFC, **kwargs): + def integrand(*parametrization): + conf = _probability_vector(*parametrization) + return np.max(conf) + + return dirichlet_exp_value(integrand, 
dirichlet_fc.alpha, **kwargs) + + +def compute_ECE(dirichlet_fc: DirichletFC, conditioned="full", **kwargs): + """ + Computes the theoretical ECE of a Dirichlet fake classifier, conditioned either on the full confidence vector, + on the confidence in the prediction, or on each class confidence separately (see [1]_ for further details) + + :param dirichlet_fc: Dirichlet fake classifier to calculate ECE for + :param conditioned: Quantity to condition ECE on + :param kwargs: passed to integrator function + :return: * If conditioned on full confidence vector returns: result, abserr, (further scipy.nquad output) + * If conditioned on the confidence in prediction returns: result, abserr, (further scipy.quad output) + * If conditioned on each class separately returns: List of num_classes+1 entries. First entry contains + average of all "i-class ECEs". Subsequent entries contain results for each "i-class ECE" + separately: result, abserr, (further scipy.quad output) + + References + ---------- + .. [1] Kull, M., Perello-Nieto, M., Kängsepp, M., Filho, T. S., Song, H., & Flach, P. (2019). Beyond temperature + scaling: Obtaining well-calibrated multiclass probabilities with Dirichlet calibration. + """ + + if conditioned == "full": + return _compute_ECE_full(dirichlet_fc, **kwargs) + elif conditioned == "confidence": + return _compute_ECE_conf(dirichlet_fc, **kwargs) + elif conditioned == "class": + return _compute_ECE_class(dirichlet_fc, **kwargs) + else: + raise ValueError('conditioned has to be one of "full", "confidence" or "class"') + + +def _compute_ECE_full(dirichlet_fc: DirichletFC, **kwargs): + def integrand(*parametrization): + conf = _probability_vector(*parametrization) + return np.abs( + np.max(conf) + - _prob_correct_prediction(conf, dirichlet_fc.simplex_automorphism) + ) + + return dirichlet_exp_value(integrand, dirichlet_fc.alpha, **kwargs) + + +def _compute_ECE_conf(dirichlet_fc: DirichletFC, **kwargs): + # Need higher precision for accurate result due to nesting of two quad/nquad calls + # Sets higher precision if precision not already set in **kwargs + opts = {"epsabs": 1e-4} + opts.update(kwargs.pop("opts", {})) + kwargs.update({"opts": opts}) + + num_classes = len(dirichlet_fc.alpha) + + def p_c(*parametrization): + return dirichlet.pdf(parametrization, dirichlet_fc.alpha) + + def p_y_c(*parametrization): + conf = _probability_vector(*parametrization) + return _prob_correct_prediction(conf, dirichlet_fc.simplex_automorphism) * p_c( + *parametrization + ) + + def integrand(max_conf): + int_p_c = simplex_integral_fixed_max(p_c, num_classes, max_conf, **kwargs)[0] + int_p_y_c = simplex_integral_fixed_max(p_y_c, num_classes, max_conf, **kwargs)[ + 0 + ] + return np.abs(int_p_y_c / int_p_c - max_conf) * int_p_c + + # At exactly 1/num_classes or 1 get 0/0 + boundary_offset = 1e-2 + + return quad( + integrand, + 1 / num_classes + boundary_offset, + 1 - boundary_offset, + epsabs=opts["epsabs"], + ) + + +def _compute_ECE_class(dirichlet_fc: DirichletFC, **kwargs): + # Need higher precision for accurate result due to nesting of two quad/nquad calls + # Sets higher precision if precision not already set in **kwargs + opts = {"epsabs": 1e-4} + opts.update(kwargs.pop("opts", {})) + kwargs.update({"opts": opts}) + + num_classes = len(dirichlet_fc.alpha) + + integral_results = [] + + for i in range(num_classes): + + def p_c(*parametrization): + return dirichlet.pdf(parametrization, dirichlet_fc.alpha) + + def p_y_c(*parametrization): + conf = _probability_vector(*parametrization) + return
_prob_class(conf, dirichlet_fc.simplex_automorphism, i) * p_c( + *parametrization + ) + + def integrand(comp_conf): + int_p_c = simplex_integral_fixed_comp( + p_c, num_classes, i, comp_conf, **kwargs + )[0] + int_p_y_c = simplex_integral_fixed_comp( + p_y_c, num_classes, i, comp_conf, **kwargs + )[0] + return np.abs(int_p_y_c / int_p_c - comp_conf) * int_p_c + + # At exactly 0 or 1 get 0/0 + boundary_offset = 1e-2 + + result = quad( + integrand, + 1 / num_classes + boundary_offset, + 1 - boundary_offset, + epsabs=opts["epsabs"], + ) + + integral_results.append(result) + + integral_results.insert(0, sum(S[0] for S in integral_results) / num_classes) + + return integral_results diff --git a/src/kyle/evaluation/discrete.py b/src/kyle/evaluation/discrete.py new file mode 100644 index 0000000..2f60d21 --- /dev/null +++ b/src/kyle/evaluation/discrete.py @@ -0,0 +1,323 @@ +from typing import Sequence, Union + +import matplotlib.pyplot as plt +import numpy as np +from matplotlib.colors import ListedColormap + +from kyle.util import safe_accuracy_score + + +class EvalStats: + """ + Class for computing evaluation statistics of classifiers, including calibration metrics + + :param y_true: integer array of shape (n_samples,) + :param confidences: array of shape (n_samples, n_classes) + :param bins: number of equal-width bins on which to evaluate the statistics + """ + + TOP_CLASS_LABEL = "top_class" + + def __init__(self, y_true: np.ndarray, confidences: np.ndarray, bins=30): + assert ( + len(y_true.shape) == 1 + ), f"y_true has to be 1-dimensional, instead got shape: {y_true.shape}" + assert ( + len(confidences.shape) == 2 + ), f"confidences have to be of shape (#samples, #classes), instead got {confidences.shape}" + assert confidences.shape[0] == len( + y_true + ), f"Mismatch between number of data points in confidences and labels, {confidences.shape[0]} != {len(y_true)}" + self.num_samples = len(y_true) + self.num_classes = confidences.shape[1] + self.y_true = y_true + self.y_pred = confidences.argmax(axis=1) + self.confidences = confidences + self._top_class_confidences = confidences.max(axis=1) + + self.bins: int = None + # due to discretization they don't sum to 1 anymore + self._discretized_confidences: np.ndarray = None + self._discretized_probab_values: np.ndarray = None + self.set_bins(bins) + + def expected_confidence(self, class_label: Union[int, str] = TOP_CLASS_LABEL): + """ + Returns the expected confidence for the selected class or for the predictions (default) + + :param class_label: either the class label as int or "top_class" + :return: + """ + if class_label == self.TOP_CLASS_LABEL: + confs = self._top_class_confidences + else: + confs = self.confidences[:, class_label] + return float(np.mean(confs)) + + def set_bins(self, bins: int): + self.bins = bins + self._discretized_probab_values = (np.arange(self.bins) + 0.5) / self.bins + bin_boundaries = np.linspace(0, 1, self.bins + 1) + bin_boundaries[ + 0 + ] = -1 # in order to associate predicted probabilities = 0 to the right bin + binned_confidences = ( + np.digitize(x=self.confidences, bins=bin_boundaries, right=True) - 1 + ) + self._discretized_confidences = (binned_confidences + 0.5) / self.bins + + def accuracy(self): + return safe_accuracy_score(self.y_true, self.y_pred) + + def marginal_accuracy(self, class_label: int): + """ + Corresponds to acc_i in our calibration paper + + :param class_label: + :return: + """ + class_label_mask = self.y_pred == class_label + predictions = self.y_pred[class_label_mask] + gt =
self.y_true[class_label_mask] + return np.sum(gt == predictions) / len(self.y_true) + + @staticmethod + def _expected_error( + probabilities: np.ndarray, members_per_bin: np.ndarray, confidences: np.ndarray + ) -> float: + """ + Computes the expected error, being the sum of abs. differences of true probabilities and mean confidences + for each bin weighted by the factor N_bin / N_total + + :param probabilities: true probability per bin + :param members_per_bin: number of samples per bin + :param confidences: mean confidence per bin + :return: + """ + total_members = np.sum(members_per_bin) + if total_members == 0: + return 0.0 + result = float(np.sum(np.abs(probabilities - confidences) * members_per_bin)) + result /= total_members + return result + + def _non_degenerate_acc_conf_differences(self) -> np.ndarray: + """ + Computes the absolute differences between accuracy and mean confidence for each non-degenerate bin, + where a bin is considered degenerate if no confidence vector can have its maximum in it. + E.g. for an N-class classifier, all bins with right-hand value below 1/N will be degenerate since the + maximum of a probabilities vector is always larger than 1/N. + + :return: array of shape (N_bins, ) + """ + accuracies, members_per_bin, confidences = self.top_class_reliabilities() + acc_conf_difference = (accuracies - confidences)[members_per_bin > 0] + return np.abs(acc_conf_difference) + + def expected_calibration_error(self): + accuracies, members_per_bin, confidences = self.top_class_reliabilities() + return self._expected_error(accuracies, members_per_bin, confidences) + + def average_calibration_error(self): + return np.mean(self._non_degenerate_acc_conf_differences()) + + def max_calibration_error(self): + return np.max(self._non_degenerate_acc_conf_differences()) + + def expected_marginal_calibration_error(self, class_label): + """ + I sort of made this up, although this very probably exists somewhere in the wild + :param class_label: + """ + ( + class_probabilities, + members_per_bin, + class_confidences, + ) = self.marginal_reliabilities(class_label) + return self._expected_error( + class_probabilities, members_per_bin, class_confidences + ) + + def average_marginal_calibration_error(self): + """ + I made this up, don't know if this metric was described anywhere yet. + It is also not completely clear what this means in terms of probabilistic quantities. + """ + errors = np.zeros(self.num_classes) + weights = np.zeros(self.num_classes) + for class_label in range(self.num_classes): + accuracies, n_members, class_confidences = self.marginal_reliabilities( + class_label + ) + total_members = np.sum(n_members) + errors[class_label] = self._expected_error( + accuracies, n_members, class_confidences + ) + weights[class_label] = total_members + return np.sum(errors * weights) / np.sum(weights) + + def class_wise_expected_calibration_error(self): + result = sum( + self.expected_marginal_calibration_error(k) for k in range(self.num_classes) + ) + result /= self.num_classes + return result + + def marginal_reliabilities(self, class_label: int): + """ + Compute the true class probabilities and numbers of members (weights) for each of the N bins of the + confidence in the given class.
+ + :return: tuple of three 1-dim arrays of length N: (accuracies_per_bin, members_per_bin, mean_class_confidences_per_bin) + """ + discretized_class_confidences = self._discretized_confidences[:, class_label] + class_confidences = self.confidences[:, class_label] + + members_per_bin = np.zeros(self.bins) + accuracies_per_bin = np.zeros(self.bins) + mean_class_confidences_per_bin = np.zeros(self.bins) + for i, probability_bin in enumerate(self._discretized_probab_values): + probability_bin_mask = discretized_class_confidences == probability_bin + cur_gt_labels = self.y_true[probability_bin_mask] + cur_class_confidences = class_confidences[probability_bin_mask] + + cur_members = np.sum(probability_bin_mask) + cur_accuracy = safe_accuracy_score( + cur_gt_labels, class_label * np.ones(len(cur_gt_labels)) + ) + if len(cur_class_confidences) > 0: + cur_mean_class_confidence = cur_class_confidences.mean() + else: + cur_mean_class_confidence = probability_bin + members_per_bin[i] = cur_members + accuracies_per_bin[i] = cur_accuracy + mean_class_confidences_per_bin[i] = cur_mean_class_confidence + return accuracies_per_bin, members_per_bin, mean_class_confidences_per_bin + + def top_class_reliabilities(self): + """ + Compute the accuracies and numbers of members (weights) for each of the N bins for top-class confidence. + + :return: tuple of three 1-dim arrays of length N: (accuracies_per_bin, members_per_bin, mean_confidences_per_bin) + """ + members_per_bin = np.zeros(self.bins) + accuracies_per_bin = np.zeros(self.bins) + mean_confidences_per_bin = np.zeros(self.bins) + discretized_top_class_confidences = self._discretized_confidences.max(axis=1) + for i, probability in enumerate(self._discretized_probab_values): + probability_bin_mask = discretized_top_class_confidences == probability + cur_members = np.sum(probability_bin_mask) + if cur_members == 0: + members_per_bin[i] = 0 + accuracies_per_bin[i] = 0 + mean_confidences_per_bin[i] = 0 + continue + + cur_gt_labels = self.y_true[probability_bin_mask] + cur_pred_labels = self.y_pred[probability_bin_mask] + cur_top_class_confidences = self._top_class_confidences[ + probability_bin_mask + ] + + cur_accuracy = safe_accuracy_score(cur_gt_labels, cur_pred_labels) + cur_mean_confidence = cur_top_class_confidences.mean() + members_per_bin[i] = cur_members + accuracies_per_bin[i] = cur_accuracy + mean_confidences_per_bin[i] = cur_mean_confidence + return accuracies_per_bin, members_per_bin, mean_confidences_per_bin + + # TODO: the reliabilities are plotted above the centers of bins, not above the mean confidences + # The latter would make plotting multiple curves at once impossible, but the plot would be more precise + def plot_reliability_curves( + self, class_labels: Sequence[Union[int, str]], display_weights=False + ): + """ + + :param class_labels: + :param display_weights: If True, for each reliability curve the weights of each bin will be + plotted as a histogram. The weights are scaled for the sake of display; only relative differences + between them have an interpretable meaning. + The errors containing "expected" in the name take these weights into account.
+ :return: + """ + colors = ListedColormap(["y", "g", "r", "c", "m"]) + + plt.figure() + plt.title(f"Reliability curves ({self.bins} bins)") + plt.xlabel("confidence") + plt.ylabel("ground truth probability") + plt.axis("equal") + x_values = self._discretized_probab_values + plt.plot( + np.linspace(0, 1), np.linspace(0, 1), label="perfect calibration", color="b" + ) + for i, class_label in enumerate(class_labels): + color = colors(i) + if isinstance(class_label, int): + label = f"class {class_label}" + y_values, weights, _ = self.marginal_reliabilities(class_label) + elif class_label == self.TOP_CLASS_LABEL: + label = "prediction" + y_values, weights, _ = self.top_class_reliabilities() + else: + raise ValueError(f"Unknown class label: {class_label}") + plt.plot(x_values, y_values, marker=".", label=label, color=color) + if display_weights: + # rescale the weights such that the maximum is at 1/2 for improved visibility + weights = 1 / 2 * weights / weights.max() + plt.bar( + x_values, + weights, + alpha=0.2, + width=1 / self.bins, + color=color, + label=f"bin_weights for {label}", + ) + + axes = plt.gca() + axes.set_xlim([0, 1]) + axes.set_ylim([0, 1]) + plt.legend(loc="best") + + # TODO: delete, I don't think we need this. Maybe add flag to only plot bin weights to the plot above + def plot_confidence_distributions( + self, class_labels: Sequence[Union[int, str]], new_fig=True + ): + """ + + :param new_fig: + :param class_labels: + :return: + """ + colors = ListedColormap(["y", "g", "r", "c", "m"]) + + if new_fig: + plt.figure() + plt.title(f" Confidence Distribution ({self.bins} bins)") + plt.xlabel("confidence") + plt.ylabel("Frequency") + x_values = self._discretized_probab_values + + for i, class_label in enumerate(class_labels): + color = colors(i) + if isinstance(class_label, int): + label = f"class {class_label}" + _, weights, _ = self.marginal_reliabilities(class_label) + elif class_label == self.TOP_CLASS_LABEL: + label = "prediction" + _, weights, _ = self.top_class_reliabilities() + else: + raise ValueError(f"Unknown class label: {class_label}") + plt.bar( + x_values, + weights, + alpha=0.3, + width=1 / self.bins, + label=label, + color=color, + ) + + axes = plt.gca() + axes.set_xlim([0, 1]) + plt.legend(loc="best") + if new_fig: + plt.show() diff --git a/src/kyle/integrals.py b/src/kyle/integrals.py new file mode 100644 index 0000000..c20d5de --- /dev/null +++ b/src/kyle/integrals.py @@ -0,0 +1,174 @@ +import logging +from typing import Callable, Protocol, Sequence + +from scipy.integrate import nquad +from scipy.stats import dirichlet + + +# this is the currently supported way to annotate callables with *args of a certain type, +# see https://mypy.readthedocs.io/en/latest/protocols.html#callback-protocols +# hopefully at some point the pycharm type checker will learn to recognize those. +# I opened an issue for JetBrains: https://youtrack.jetbrains.com/issue/PY-45438 +class Integrand(Protocol): + def __call__(self, *parameters: float) -> float: + ... + + +def simplex_integral( + f: Callable, num_classes: int, boundary_offset=1e-10, coord_sum: float = 1, **kwargs +): + """ + Performs an integral over num_classes-1 dimensional simplex using scipy + + :param f: function to integrate over the simplex. Should accept num_classes-1 variables + :param num_classes: equals dimension of the simplex + 1 + :param boundary_offset: can be used to prevent numerical errors due to singularities at the simplex' boundary + :param coord_sum: sets sum of coordinates of simplex. 
For standard simplex sum(x1,x2,...) = 1. Mainly useful for + simplex_integral_fixed_comp + :param kwargs: will be passed to scipy.integrate.nquad + :return: + """ + if num_classes < 2: + raise ValueError("need at least two classes") + + def nested_variable_boundary(*previous_variables: float): + """ + Any variable for the simplex integral goes from zero to coord_sum (usually 1) - sum(all previous variables). + See docu of nquad for more details on boundaries + """ + return [ + 0 + boundary_offset, + coord_sum - sum(previous_variables) - boundary_offset, + ] + + simplex_boundary = [nested_variable_boundary] * (num_classes - 1) + # we typically don't need higher precision + opts = {"epsabs": 1e-2} + opts.update(kwargs.pop("opts", {})) + return nquad(f, simplex_boundary, opts=opts, **kwargs) + + +def simplex_integral_fixed_comp( + f: Callable, num_classes: int, selected_class: int, x_comp: float, **kwargs +): + """ + Performs an integral over the subset of a num_classes-1 dimensional simplex defined by the selected_class component + of the confidence vector having a fixed value of x_comp, i.e. marginalises out all other classes. + + Computing this involves integrating over a num_classes-2 dimensional non-unit simplex with coord_sum set to 1-x_comp + and with the selected_class argument of f being set to x_comp + + :param f: function to integrate over the subset of the simplex. Should accept num_classes-1 variables + :param num_classes: equals dimension of the simplex + 1 + :param selected_class: selected confidence vector component [0, num_classes-1] + :param x_comp: fixed value of the selected vector component + :param kwargs: passed to simplex_integral + :return: + """ + + if not (0 <= x_comp <= 1): + raise ValueError("x_comp has to lie in the range [0, 1]") + + if selected_class == num_classes - 1: + + def constrained_integrand(*args: float): + constrained_args = [1 - x_comp - sum(args[0:]), *args[0:]] + return f(*constrained_args) + + else: + + def constrained_integrand(*args: float): + constrained_args = [*args[0:selected_class], x_comp, *args[selected_class:]] + return f(*constrained_args) + + return simplex_integral( + constrained_integrand, num_classes - 1, coord_sum=1 - x_comp, **kwargs + ) + + +def simplex_integral_fixed_max(f: Callable, num_classes: int, x_max: float, **kwargs): + """ + Performs an integral over the subset of a num_classes-1 dimensional simplex defined by the largest + coordinate/confidence having a fixed value of x_max, i.e. marginalises over all possible confidence vectors with + maximum confidence of x_max. + + Computing this integral involves computing the sum of num_classes integrals each over a num_classes-2 dimensional + simplex. For x_max > 0.5 the integrals are 'true' simplex integrals. For x_max < 0.5 the boundaries become complex + and non-simplex like. The integrals can then be extended to full simplex integrals using an appropriate indicator + function, ``get_argmax_region_char_function``. + + :param f: function to integrate over the subset of the simplex. Should accept num_classes-1 variables + :param num_classes: equals dimension of the simplex + 1 + :param x_max: fixed value of the largest coordinate;
defines the subset of the simplex + :param kwargs: passed to simplex_integral_fixed_comp + :return: + """ + + if not (1 / num_classes < x_max < 1): + return 0, 0 + + # For small x_max, higher precision is required for accurate results, as the integrand is 0 over a large part of the integration range + # Sets higher precision if precision not already set in **kwargs + if x_max < 1 / 2: + opts = {"epsabs": 1e-4} + opts.update(kwargs.pop("opts", {})) + kwargs.update({"opts": opts}) + + integral_result = (0, 0) + + for i in range(num_classes): + + argmax_char_func = get_argmax_region_char_function(i) + + constrained_integral = simplex_integral_fixed_comp( + lambda *args: argmax_char_func(*args) * f(*args), + num_classes, + i, + x_max, + **kwargs, + ) + integral_result = tuple( + sum(p) for p in zip(integral_result, constrained_integral) + ) + + return integral_result + + +def dirichlet_exp_value(f: Callable, alpha: Sequence[float], **kwargs): + """ + Computes the expectation value of f over the num_classes-1 dimensional simplex using scipy. Note that + scipy.stats.dirichlet.pdf for n classes accepts n-1 entries since sum(x_n) = 1. + + :param f: + :param alpha: the parameters of the dirichlet distribution, one for each class + :param kwargs: passed to simplex_integral + :return: + """ + num_classes = len(alpha) + return simplex_integral( + lambda *args: f(*args) * dirichlet.pdf(args, alpha), num_classes, **kwargs + ) + + +def get_argmax_region_char_function(selected_class: int) -> Integrand: + """ + Returns the char. function for the area in which the selected class is the argmax of the input args. + The returned function takes a variable number of floats as input. They represent the first N-1 independent + entries of an element of a simplex in N-dimensional space (N classes). + """ + + def char_function(*args: float): + if len(args) < 1: + raise ValueError("need at least two classes/one input") + if not 0 <= selected_class <= len(args): + raise IndexError( + f"selected_class {selected_class} out of bounds for input of length {len(args)}" + ) + probabilities = list(args) + [1 - sum(args)] + class_confidence = probabilities[selected_class] + return float(class_confidence == max(probabilities)) + + return char_function + + +log = logging.getLogger(__name__) diff --git a/src/kyle/metrics/__init__.py b/src/kyle/metrics/__init__.py new file mode 100644 index 0000000..0f3f9b1 --- /dev/null +++ b/src/kyle/metrics/__init__.py @@ -0,0 +1 @@ +from .calibration_metrics import ACE, ECE, MCE, BaseCalibrationError diff --git a/src/kyle/metrics/calibration_metrics.py b/src/kyle/metrics/calibration_metrics.py new file mode 100644 index 0000000..a059985 --- /dev/null +++ b/src/kyle/metrics/calibration_metrics.py @@ -0,0 +1,59 @@ +from abc import ABC, abstractmethod + +import netcal.metrics +import numpy as np + +from kyle.util import in_simplex + + +class BaseCalibrationError(ABC): + @abstractmethod + def _compute( + self, confidences: np.ndarray, ground_truth: np.ndarray, **kwargs + ) -> float: + pass + + def compute(self, confidences: np.ndarray, ground_truth: np.ndarray, **kwargs): + if not in_simplex(confidences): + raise ValueError("Invalid confidences array") + return self._compute(confidences, ground_truth, **kwargs) + + def __str__(self): + return self.__class__.__name__ + + +class NetcalCalibrationError(BaseCalibrationError): + def __init__(self, netcal_metric): + """ + :param netcal_metric: instance of a netcal metric class, e.g.
netcal.metrics.ECE + """ + self.netcal_metric = netcal_metric + + def _compute( + self, confidences: np.ndarray, ground_truth: np.ndarray, **kwargs + ) -> float: + return self.netcal_metric.measure(confidences, ground_truth, **kwargs) + + +class ACE(NetcalCalibrationError): + """Average Calibration Error. Wraps around netcal's implementation - for further reading refer to netcal's docs.""" + + def __init__(self, bins: int = 10): + super(ACE, self).__init__(netcal.metrics.ACE(bins)) + + +class ECE(NetcalCalibrationError): + """Expected Calibration Error. Wraps around netcal's implementation - for further reading refer to netcal's docs.""" + + def __init__(self, bins: int = 10): + super().__init__(netcal.metrics.ECE(bins)) + + +class MCE(NetcalCalibrationError): + """Maximum Calibration Error. Wraps around netcal's implementation - for further reading refer to netcal's docs.""" + + def __init__(self, bins: int = 10): + super().__init__(netcal.metrics.MCE(bins)) + + +# TODO: get rid of this diff --git a/src/kyle/models/__init__.py b/src/kyle/models/__init__.py new file mode 100644 index 0000000..fa22db8 --- /dev/null +++ b/src/kyle/models/__init__.py @@ -0,0 +1 @@ +from .calibratable_model import CalibratableModel diff --git a/src/kyle/models/calibratable_model.py b/src/kyle/models/calibratable_model.py new file mode 100644 index 0000000..dc4af9e --- /dev/null +++ b/src/kyle/models/calibratable_model.py @@ -0,0 +1,50 @@ +from typing import Protocol + +import numpy as np + +from kyle.calibration.calibration_methods import ( + BaseCalibrationMethod, + TemperatureScaling, +) + + +class ClassifierProtocol(Protocol): + def fit(self, X: np.ndarray, y: np.ndarray): + ... + + def predict(self, X: np.ndarray) -> np.ndarray: + ... + + def predict_proba(self, X: np.ndarray) -> np.ndarray: + ... + + +class CalibratableModel(ClassifierProtocol): + def __init__( + self, + model: ClassifierProtocol, + calibration_method: BaseCalibrationMethod = TemperatureScaling(), + ): + self.model = model + self.calibration_method = calibration_method + + def calibrate(self, X: np.ndarray, y: np.ndarray): + uncalibrated_confidences = self.model.predict_proba(X) + self.calibration_method.fit(uncalibrated_confidences, y) + + def fit(self, X: np.ndarray, y: np.ndarray): + self.model.fit(X, y) + + def predict(self, X: np.ndarray) -> np.ndarray: + calibrated_proba = self.predict_proba(X) + + # calibrated_proba has shape (n_samples, n_classes) + return np.argmax(calibrated_proba, axis=1) + + def predict_proba(self, X: np.ndarray) -> np.ndarray: + uncalibrated_confidences = self.model.predict_proba(X) + return self.calibration_method.get_calibrated_confidences( + uncalibrated_confidences + ) + + def __str__(self): + return f"{self.__class__.__name__}, method: {self.calibration_method}" diff --git a/src/kyle/models/resnet.py b/src/kyle/models/resnet.py new file mode 100644 index 0000000..949c362 --- /dev/null +++ b/src/kyle/models/resnet.py @@ -0,0 +1,123 @@ +""" +Code taken from: https://github.com/akamaster/pytorch_resnet_cifar10 +Proper implementation of ResNet20 for Cifar10.
Pytorch only has ResNets for ImageNet which +differ in number of parameters +""" +import torch +import torch.nn as nn +import torch.nn.functional as F +import torch.nn.init as init + + +def _weights_init(m): + if isinstance(m, nn.Linear) or isinstance(m, nn.Conv2d): + init.kaiming_normal_(m.weight) + + +class LambdaLayer(nn.Module): + def __init__(self, lambd): + super(LambdaLayer, self).__init__() + self.lambd = lambd + + def forward(self, x): + return self.lambd(x) + + +class BasicBlock(nn.Module): + expansion = 1 + + def __init__(self, in_planes, planes, stride=1, option="A"): + super(BasicBlock, self).__init__() + self.conv1 = nn.Conv2d( + in_planes, planes, kernel_size=3, stride=stride, padding=1, bias=False + ) + self.bn1 = nn.BatchNorm2d(planes) + self.conv2 = nn.Conv2d( + planes, planes, kernel_size=3, stride=1, padding=1, bias=False + ) + self.bn2 = nn.BatchNorm2d(planes) + + self.shortcut = nn.Sequential() + if stride != 1 or in_planes != planes: + if option == "A": + """ + For CIFAR10 ResNet paper uses option A. + """ + self.shortcut = LambdaLayer( + lambda x: F.pad( + x[:, :, ::2, ::2], + (0, 0, 0, 0, planes // 4, planes // 4), + "constant", + 0, + ) + ) + elif option == "B": + self.shortcut = nn.Sequential( + nn.Conv2d( + in_planes, + self.expansion * planes, + kernel_size=1, + stride=stride, + bias=False, + ), + nn.BatchNorm2d(self.expansion * planes), + ) + + def forward(self, x): + out = F.relu(self.bn1(self.conv1(x))) + out = self.bn2(self.conv2(out)) + out += self.shortcut(x) + out = F.relu(out) + return out + + +class ResNet(nn.Module): + def __init__(self, block, num_blocks, num_classes=10): + super(ResNet, self).__init__() + self.in_planes = 16 + + self.conv1 = nn.Conv2d(3, 16, kernel_size=3, stride=1, padding=1, bias=False) + self.bn1 = nn.BatchNorm2d(16) + self.layer1 = self._make_layer(block, 16, num_blocks[0], stride=1) + self.layer2 = self._make_layer(block, 32, num_blocks[1], stride=2) + self.layer3 = self._make_layer(block, 64, num_blocks[2], stride=2) + self.linear = nn.Linear(64, num_classes) + + self.apply(_weights_init) + + def _make_layer(self, block, planes, num_blocks, stride): + strides = [stride] + [1] * (num_blocks - 1) + layers = [] + for stride in strides: + layers.append(block(self.in_planes, planes, stride)) + self.in_planes = planes * block.expansion + + return nn.Sequential(*layers) + + def forward(self, x): + out = F.relu(self.bn1(self.conv1(x))) + out = self.layer1(out) + out = self.layer2(out) + out = self.layer3(out) + out = F.avg_pool2d(out, out.size()[3]) + out = out.view(out.size(0), -1) + out = self.linear(out) + return out + + +def resnet20(): + return ResNet(BasicBlock, [3, 3, 3]) + + +def resnet56(): + return ResNet(BasicBlock, [9, 9, 9]) + + +def load_weights(weights_path: str, model: ResNet): + weights_dict = torch.load(weights_path, map_location=torch.device("cpu"))[ + "state_dict" + ] + weights_dict = { + key.replace("module.", ""): value for key, value in weights_dict.items() + } + model.load_state_dict(weights_dict) diff --git a/src/kyle/sampling/__init__.py b/src/kyle/sampling/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/src/kyle/sampling/fake_clf.py b/src/kyle/sampling/fake_clf.py new file mode 100644 index 0000000..36d32c1 --- /dev/null +++ b/src/kyle/sampling/fake_clf.py @@ -0,0 +1,395 @@ +from abc import ABC, abstractmethod +from typing import Sequence, Union + +import numpy as np +import scipy.optimize +import scipy.stats + +from kyle.transformations import IdentitySimplexAut, SimplexAut +from 
kyle.util import sample_index + + +class FakeClassifier(ABC): + def __init__( + self, + num_classes: int, + simplex_automorphism: SimplexAut = None, + check_io=True, + ): + if num_classes < 2: + raise ValueError(f"{self.__class__.__name__} requires at least two classes") + self.num_classes = num_classes + self._rng = np.random.default_rng() + + self._simplex_automorphism: SimplexAut = None + self.set_simplex_automorphism(simplex_automorphism) + self.check_io = check_io + + # TODO or not TODO: one could get rid of the separate SimplexAut class in favor of passing a function + # pro: the function is less verbose to write, easier for user; contra: naming and state become more convoluted + def set_simplex_automorphism(self, aut: Union[SimplexAut, None]) -> None: + """ + :param aut: if None, the identity automorphism will be set + """ + if aut is None: + aut = IdentitySimplexAut(self.num_classes) + if aut.num_classes is not None and aut.num_classes != self.num_classes: + raise ValueError(f"{aut} has wrong number of classes: {aut.num_classes}") + self._simplex_automorphism = aut + + @abstractmethod + def sample_confidences(self, n_samples: int) -> np.ndarray: + ... + + @property + def simplex_automorphism(self): + return self._simplex_automorphism + + def get_sample_arrays(self, n_samples: int): + """ + Get arrays with ground truth labels and predicted probabilities + + :param n_samples: + :return: tuple of arrays of shapes (n_samples,), (n_samples, n_classes) + """ + confidences = self.sample_confidences(n_samples) + gt_probabilities = self.simplex_automorphism.transform( + confidences, check_io=self.check_io + ) + gt_labels = sample_index(gt_probabilities) + return gt_labels, confidences + + def __str__(self): + return f"{self.__class__.__name__}_{self.simplex_automorphism}" + + +class DirichletFC(FakeClassifier): + def __init__( + self, + num_classes: int, + alpha: Sequence[float] = None, + simplex_automorphism: SimplexAut = None, + ): + super().__init__(num_classes, simplex_automorphism=simplex_automorphism) + + self._alpha: np.ndarray = None + self.set_alpha(alpha) + + def set_alpha(self, alpha: Union[np.ndarray, None]): + """ + :param alpha: if None, the default value of [1, ..., 1] will be set + """ + if alpha is None: + alpha = np.ones(self.num_classes) + else: + alpha = np.array(alpha) + if not alpha.shape == (self.num_classes,): + raise ValueError(f"Wrong shape of alpha: {alpha.shape}") + self._alpha = alpha + + @property + def alpha(self): + return self._alpha + + def sample_confidences(self, n_samples: int) -> np.ndarray: + return self._rng.dirichlet(self.alpha, size=n_samples) + + def pdf(self, confidences, alpha=None): + if alpha is None: + alpha = self.alpha + return scipy.stats.dirichlet.pdf(confidences.T, alpha) + + def fit(self, confidences, initial_alpha=None, alpha_bounds=None, **kwargs): + """ + Fits the Dirichlet fake classifier to the provided confidence distribution using maximum likelihood estimation + and sets the fake classifier parameters to the best fit parameters + + :param confidences: Numpy array of shape (num_samples, num_classes); + confidence distribution to fit classifier to + :param initial_alpha: Numpy array of shape (num_classes,); initial guess for the alpha parameters. + If None, uses the current value of alpha + :param alpha_bounds: Tuple, (lower_bound, upper_bound); Bounds for fitting alpha parameters.
A lower/upper bound + of None corresponds to an unbounded parameter + :param kwargs: passed to ``scipy.optimize.minimize`` + :return: + """ + if initial_alpha is None: + initial_alpha = self.alpha + + if alpha_bounds is None: + alpha_bounds = (0.0001, None) + + # rescale confidences to avoid divergences on the sides of the simplex and renormalize + confidences = ( + confidences * (confidences.shape[0] - 1) + 1 / self.num_classes + ) / confidences.shape[0] + confidences = confidences / np.sum(confidences, axis=1)[:, None] + + alpha_bounds = [alpha_bounds] * self.num_classes + + nll = lambda parm: -np.sum(np.log(self.pdf(confidences, parm))) + mle_fit = scipy.optimize.minimize( + nll, initial_alpha, bounds=alpha_bounds, **kwargs + ) + self.set_alpha(mle_fit.x) + + return mle_fit + + +class MultiDirichletFC(FakeClassifier): + """ + A fake classifier that first draws from a K-categorical distribution and, based on the result, then draws from + 1 of K Dirichlet distributions of a restricted form. + The k'th Dirichlet distribution has parameters of the form: sigma * {1, 1, ..., alpha_k, 1, 1, ...}; alpha_k > 1, + where 'alpha_k' is at the k'th position. + Effectively, this yields a distribution with one mode per corner of the simplex, with variable mode position and variance. + + :param num_classes: + :param alpha: numpy array of shape (num_classes,). k'th entry corresponds to alpha_k for the k'th dirichlet + :param sigma: numpy array of shape (num_classes,). k'th entry corresponds to sigma for the k'th dirichlet + :param distribution_weights: numpy array of shape (num_classes,). Probabilities used for drawing from the K-categorical + :param simplex_automorphism: + """ + + def __init__( + self, + num_classes: int, + alpha: Sequence[float] = None, + sigma: Sequence[float] = None, + distribution_weights: Sequence[float] = None, + simplex_automorphism: SimplexAut = None, + ): + super().__init__(num_classes, simplex_automorphism=simplex_automorphism) + + self._alpha: np.ndarray = None + self._sigma: np.ndarray = None + self._distribution_weights: np.ndarray = None + + self.set_alpha(alpha) + self.set_sigma(sigma) + self.set_distribution_weights(distribution_weights) + + @property + def alpha(self): + return self._alpha + + def set_alpha(self, alpha: Union[np.ndarray, None]): + """ + :param alpha: if None, the default value of [1, ..., 1] will be set.
+ """ + if alpha is None: + alpha = np.ones(self.num_classes) + else: + alpha = np.array(alpha) + if not alpha.shape == (self.num_classes,): + raise ValueError(f"Wrong shape of alpha: {alpha.shape}") + self._alpha = alpha + + @property + def sigma(self): + return self._sigma + + def set_sigma(self, sigma: Union[np.ndarray, None]): + """ + :param sigma: if None, the default value of [1, ..., 1] will be set + """ + if sigma is None: + sigma = np.ones(self.num_classes) + else: + sigma = np.array(sigma) + if not sigma.shape == (self.num_classes,): + raise ValueError(f"Wrong shape of sigma: {sigma.shape}") + self._sigma = sigma + + @property + def distribution_weights(self): + return self._distribution_weights + + def set_distribution_weights(self, distribution_weights: Union[np.ndarray, None]): + """ + :param distribution_weights: if None, the default value of [1/num_classes, ..., 1/num_classes] will be set + """ + if distribution_weights is None: + distribution_weights = np.ones(self.num_classes) / self.num_classes + else: + distribution_weights = np.array(distribution_weights) + if not distribution_weights.shape == (self.num_classes,): + raise ValueError( + f"Wrong shape of predicted_class_weights: {distribution_weights.shape}" + ) + self._distribution_weights = distribution_weights / np.sum(distribution_weights) + + def get_parameters(self): + return self._alpha, self._sigma, self._distribution_weights + + def set_parameters(self, alpha, sigma, distribution_weights): + self.set_alpha(alpha) + self.set_sigma(sigma) + self.set_distribution_weights(distribution_weights) + + def sample_confidences(self, n_samples: int) -> np.ndarray: + + weight_array = np.repeat(self.distribution_weights[None, :], n_samples, axis=0) + chosen_distributions = sample_index(weight_array) + + confidences = np.zeros((n_samples, self.num_classes)) + + for i, chosen_distribution in enumerate(chosen_distributions): + alpha_vector = np.ones(self.num_classes) + alpha_vector[chosen_distribution] = self.alpha[chosen_distribution] + alpha_vector *= self.sigma[chosen_distribution] + + confidences[i, :] = self._rng.dirichlet(alpha_vector) + + return confidences + + def pdf(self, confidences, alpha=None, sigma=None, distribution_weights=None): + """ + Computes pdf of MultiDirichletFC. 
Using a K-categorical distribution to sample from K Dirichlet distributions + is equivalent to sampling from a pdf that is a weighted sum of the K individual Dirichlet pdf's + :param confidences: numpy array of shape (num_classes,) or (num_samples, num_classes) + :param distribution_weights: numpy array of shape (num_classes,); uses self.distribution_weights if not provided + :param sigma: numpy array of shape (num_classes,); uses self.sigma if not provided + :param alpha: numpy array of shape (num_classes,); uses self.alpha if not provided + """ + + if alpha is None: + alpha = self.alpha + if sigma is None: + sigma = self.sigma + if distribution_weights is None: + distribution_weights = self.distribution_weights + + confidences = confidences.T + + distributions = np.zeros(confidences.shape) + + for i, (a, s) in enumerate(zip(alpha, sigma)): + alpha_vector = np.ones(self.num_classes) + alpha_vector[i] = a + alpha_vector *= s + + distributions[i] = scipy.stats.dirichlet.pdf(confidences, alpha_vector) + + return np.sum(distribution_weights[:, None] * distributions, axis=0) / np.sum( + distribution_weights + ) + + def fit( + self, + confidences, + initial_parameters=None, + parameter_bounds=None, + simplified_fitting=True, + **kwargs, + ): + """ + Fits a Multi-Dirichlet fake classifier to the provided confidence distribution using maximum likelihood + estimation and sets the fake classifier parameters to the best fit parameters. + If simplified_fitting is set to False, all parameters of the fake classifier are fit directly via MLE. + If simplified_fitting is set to True, each Dirichlet is fit separately. Alpha and sigma of the k'th Dirichlet + are fit to the subset of the confidences that predict the k'th class, i.e. for which argmax(c) = k. The + distribution weights are not fit, but estimated from the predicted class probabilities of the confidence + distribution. + + :param confidences: Numpy array of shape (num_samples, num_classes); + confidence distribution to fit classifier to + :param initial_parameters: Numpy array of shape (3,) ((2,) for simplified_fitting=True) + Corresponds to initial guesses for each parameter 'class' alpha, sigma and distribution_weights + If None, uses [1, 1, 1/num_classes] + :param parameter_bounds: Sequence of 3 (2 for simplified_fitting=True) tuples (lower_bound, upper_bound) + Corresponds to the bounds on each parameter 'class', alpha, sigma and distribution_weights + A lower/upper bound of None corresponds to unbounded parameters + If None, uses intervals [(0, +infinity), (0, +infinity), (0, 1)] + :param simplified_fitting: If False, directly fits the Multi-Dirichlet FC to the confidence distribution + If True, fits each Dirichlet separately.
Only fits alpha and sigma, not + distribution_weights + :param kwargs: passed to ``scipy.optimize.minimize`` + :return: If simplified_fitting=False: scipy OptimizeResult + If simplified_fitting=True: List of num_classes OptimizeResults, one for each separate Dirichlet fit + """ + + # rescale confidences to avoid divergences on the sides of the simplex and renormalize + confidences = ( + confidences * (confidences.shape[0] - 1) + 1 / self.num_classes + ) / confidences.shape[0] + confidences = confidences / np.sum(confidences, axis=1)[:, None] + + if not simplified_fitting: + if initial_parameters is None: + initial_parameters = np.array([1, 1, 1 / self.num_classes]) + if parameter_bounds is None: + # dirichlet distribution undefined for alpha/sigma parameters exactly = 0 + parameter_bounds = [(0.0001, None)] * 2 + [(0, 1)] + + # scipy requires an initial guess and a bound (lower, upper) for each parameter + # not just each parameter class + initial_parameters = np.repeat(initial_parameters, self.num_classes) + parameter_bounds = [ + pair for pair in parameter_bounds for _ in range(self.num_classes) + ] + + nll = lambda parms: -np.sum( + np.log(self.pdf(confidences, *np.split(parms, 3))) + ) + mle_fit = scipy.optimize.minimize( + nll, initial_parameters, bounds=parameter_bounds, **kwargs + ) + self.set_parameters(*np.split(mle_fit.x, 3)) + + return mle_fit + + if simplified_fitting: + if initial_parameters is None: + initial_parameters = np.array([1, 1]) + if parameter_bounds is None: + # dirichlet distribution undefined for alpha/sigma parameters exactly = 0 + parameter_bounds = [(0.0001, None)] * 2 + + predicted_class = np.argmax(confidences, axis=1) + class_split_confidences = [ + confidences[predicted_class == i, :] for i in range(self.num_classes) + ] + + estimated_distribution_weights = [ + k_class_conf.shape[0] for k_class_conf in class_split_confidences + ] + estimated_distribution_weights = estimated_distribution_weights / np.sum( + estimated_distribution_weights + ) + + mle_fits = [] + + for k, k_class_confidences in enumerate(class_split_confidences): + + def k_dir_nll(alpha_k, sigma_k): + alpha = np.ones(self.num_classes) + alpha[k] = alpha_k + sigma = np.ones(self.num_classes) + sigma[k] = sigma_k + # 'isolate' the k'th dirichlet distribution + distribution_weights = np.zeros(self.num_classes) + distribution_weights[k] = 1 + return -np.sum( + np.log( + self.pdf( + k_class_confidences, alpha, sigma, distribution_weights + ) + ) + ) + + k_initial_parameters = initial_parameters + k_parameter_bounds = parameter_bounds + + k_dir_mle_fit = scipy.optimize.minimize( + lambda parms: k_dir_nll(*parms), + k_initial_parameters, + bounds=k_parameter_bounds, + **kwargs, + ) + mle_fits.append(k_dir_mle_fit) + + self.set_alpha(np.array([k_mle_fit.x[0] for k_mle_fit in mle_fits])) + self.set_sigma(np.array([k_mle_fit.x[1] for k_mle_fit in mle_fits])) + self.set_distribution_weights(estimated_distribution_weights) + + return mle_fits diff --git a/src/kyle/transformations.py b/src/kyle/transformations.py new file mode 100644 index 0000000..76ae59e --- /dev/null +++ b/src/kyle/transformations.py @@ -0,0 +1,154 @@ +from abc import ABC, abstractmethod +from typing import Callable, Sequence + +import numpy as np + +from kyle.util import in_simplex + + +class SimplexAut(ABC): + """ + Base class for all simplex automorphisms + + :param num_classes: The dimension of the simplex vector, equals 1 + (dimension of the simplex as manifold). + If provided, will use this for additional I/O checks.
+ """ + + def __init__(self, num_classes: int = None): + # Several transformations can be defined without referring to num_classes, which is why it is optional. + self.num_classes = num_classes + + def __str__(self): + return self.__class__.__name__ + + @abstractmethod + def _transform(self, x: np.ndarray) -> np.ndarray: + """ + :param x: array of shape (n_samples, n_classes) + :return: transformed array of shape (n_samples, n_classes) + """ + pass + + def transform(self, x: np.ndarray, check_io=True) -> np.ndarray: + if len(x.shape) == 1: + x = x[None, :] + if check_io and not in_simplex(x, self.num_classes): + raise ValueError(f"Input has to be from a simplex of suitable dimension") + x = self._transform(x) + if check_io and not in_simplex(x, self.num_classes): + raise ValueError( + f"Bad implementation: Output has to be from a simplex of suitable dimension" + ) + return x.squeeze() + + +class IdentitySimplexAut(SimplexAut): + def _transform(self, x: np.ndarray) -> np.ndarray: + return x + + +class SingleComponentSimplexAut(SimplexAut): + """ + A simplex automorphism resulting from the application of a map on the unit interval to a + single component of x and normalizing the result. + + :param component: integer in range [0, num_classes - 1], corresponding to the component on which to apply the mapping + :param mapping: map from the unit interval [0,1] to itself, should be applicable to arrays + :param num_classes: The dimension of the simplex vector, equals 1 + (dimension of the simplex as manifold). + If provided, will use this for addition I/O checks. + """ + + def __init__( + self, + component: int, + mapping: Callable[[np.ndarray], np.ndarray], + num_classes: int = None, + ): + assert ( + 0 <= component < num_classes + ), "Selected component should be in the range [0, num_classes - 1]" + self.component = component + self.mapping = mapping + super().__init__(num_classes=num_classes) + + def _transform(self, x: np.ndarray) -> np.ndarray: + x = x.copy() + x[:, self.component] = self.mapping(x[:, self.component]) + return x / x.sum(axis=1)[:, None] + + +class MaxComponentSimplexAut(SimplexAut): + """ + A simplex automorphism resulting from the application of a map on the unit interval to a + the argmax of x and normalizing the remaining components such that the output vector sums to 1. + + :param mapping: map from the unit interval [0,1] to itself, must be applicable to arrays + :param num_classes: The dimension of the simplex vector, equals 1 + (dimension of the simplex as manifold). + If provided, will use this for addition I/O checks. 
+ """ + + def __init__(self, mapping: Callable[[np.ndarray], np.ndarray], num_classes=None): + self.mapping = mapping + super().__init__(num_classes=num_classes) + + def _transform(self, x: np.ndarray) -> np.ndarray: + # this transform has a singularity if one component exactly equals one, so we add a minor "noise" + x = x + 1e-10 + x = x / x.sum(axis=1)[:, None] + + argmax = x.argmax(axis=1) + old_values = np.choose(argmax, x.T) + new_values = self.mapping(old_values) + # the result must sum to 1, so we will rescale the remaining entries of the confidence vectors + remaining_comps_normalization = (1 - new_values) / (1 - old_values) + new_values_compensated_for_norm = new_values / remaining_comps_normalization + np.put_along_axis( + x, argmax[:, None], new_values_compensated_for_norm[:, None], axis=1 + ) + return x * remaining_comps_normalization[:, None] + + +class PowerLawSimplexAut(SimplexAut): + """ + An automorphism resulting from taking elementwise powers of the inputs with fixed exponents + and normalizing the result. + + | + | *Intuition*: + + If exponents[j] < exponents[i], then the output will be more shifted towards the j-th direction + than the i-th. If all exponents are equal to some number s, then s>1 means a shift towards the boundary + of the simplex whereas 0 np.ndarray: + x = np.float_power(x, self.exponents) + return x / x.sum(axis=1)[:, None] + + +class RestrictedPowerSimplexAut(SimplexAut): + """ + Maybe a bad idea, feels unnatural + """ + + def __init__(self, exponents: np.ndarray): + """ + + :param exponents: numpy array of shape (num_classes - 1, ) + """ + if not np.all(exponents >= 1): + raise ValueError("Only exponents >= 1 are permitted") + self.exponents = exponents[None, :] + super().__init__(len(exponents) + 1) + + def _transform(self, x: np.ndarray) -> np.ndarray: + x = x.copy() + x[:, :-1] = np.float_power(x[:, :-1], self.exponents) + x[:, -1] = 1 - x[:, :-1].sum(axis=1) + return x / x.sum(axis=1)[:, None] diff --git a/src/kyle/util.py b/src/kyle/util.py new file mode 100644 index 0000000..2554c86 --- /dev/null +++ b/src/kyle/util.py @@ -0,0 +1,63 @@ +from typing import Union + +import numpy as np +from sklearn.metrics import accuracy_score + + +def safe_accuracy_score(y_true: np.ndarray, y_pred: np.ndarray, **kwargs) -> float: + """ + Wrapper around sklearn accuracy store that returns zero for empty sequences of labels + + :param y_true: Ground truth (correct) labels. + :param y_pred: Predicted labels, as returned by a classifier. + :param kwargs: + :return: + """ + if len(y_true) == len(y_pred) == 0: + return 0 + return accuracy_score(y_true, y_pred, **kwargs) + + +def in_simplex(probabilities: np.ndarray, num_classes=None) -> bool: + """ + + :param probabilities: single vector of probabilities of shape (n_classes,) or multiple + vectors as array of shape (n_samples, n_classes) + :param num_classes: if provided, will check whether probability vectors have the correct number of classes + :return: + """ + if len(probabilities.shape) == 1: + probabilities = probabilities[None, :] + if num_classes is None: + num_classes = probabilities.shape[1] + + return ( + probabilities.shape[1] == num_classes + and np.allclose(np.sum(probabilities, axis=1), 1.0, rtol=0.01) + and (probabilities >= 0).all() + and (probabilities <= 1).all() + ) + + +def sample_index(probabilities: np.ndarray) -> Union[int, np.ndarray]: + """ + Sample indices with the input probabilities. 
This is essentially a vectorized + version of np.random.choice + + :param probabilities: single vector of probabilities of shape (n_indices,) or multiple + vectors as array of shape (n_samples, n_indices) + :return: index or array of indices + """ + rng = np.random.default_rng() + if len(probabilities.shape) == 1: + return rng.choice(len(probabilities), p=probabilities) + elif len(probabilities.shape) == 2: + # this is a vectorized implementation of np.random.choice with inverse transform sampling + # see e.g. https://stephens999.github.io/fiveMinuteStats/inverse_transform_sampling.html + # and https://stackoverflow.com/questions/47722005/vectorizing-numpy-random-choice-for-given-2d-array-of-probabilities-along-an-a + random_uniform = rng.random(len(probabilities))[:, None] + return (probabilities.cumsum(axis=1) > random_uniform).argmax(axis=1) + else: + raise ValueError( + f"Unsupported input shape: {probabilities.shape}. Can only be 1 or 2 dimensional." + ) diff --git a/tests/conftest.py b/tests/conftest.py new file mode 100644 index 0000000..faa3f53 --- /dev/null +++ b/tests/conftest.py @@ -0,0 +1,17 @@ +import numpy as np +import pytest + +from kyle.sampling.fake_clf import DirichletFC +from kyle.transformations import PowerLawSimplexAut + + +@pytest.fixture(scope="module") +def uncalibrated_samples(): + faker = DirichletFC(2, simplex_automorphism=PowerLawSimplexAut(np.array([30, 20]))) + return faker.get_sample_arrays(1000) + + +@pytest.fixture(scope="module") +def calibrated_samples(): + faker = DirichletFC(2) + return faker.get_sample_arrays(1000) diff --git a/tests/kale/test_FakeClassifier.py b/tests/kale/test_FakeClassifier.py deleted file mode 100644 index cb5ad45..0000000 --- a/tests/kale/test_FakeClassifier.py +++ /dev/null @@ -1,8 +0,0 @@ -from kale.sampling import FakeClassifier - - -def test_FakeClassifier(): - faker = FakeClassifier(3) - ground_truth, class_proba = faker.get_sample() - assert ground_truth in [0, 1, 2] - assert len(class_proba) == 3 diff --git a/tests/kyle/calibration/calibration_methods/test_calibration_methods.py b/tests/kyle/calibration/calibration_methods/test_calibration_methods.py new file mode 100644 index 0000000..041d140 --- /dev/null +++ b/tests/kyle/calibration/calibration_methods/test_calibration_methods.py @@ -0,0 +1,26 @@ +import pytest + +from kyle.calibration.calibration_methods import TemperatureScaling +from kyle.metrics import ECE + + +@pytest.fixture(scope="module") +def metric(): + return ECE() + + +@pytest.fixture(scope="module") +def calibration_method(): + return TemperatureScaling() + + +def test_methods_calibrationErrorLessAfterCalibration( + metric, uncalibrated_samples, calibration_method +): + ground_truth, confidences = uncalibrated_samples + error_pre_calibration = metric.compute(confidences, ground_truth) + calibration_method.fit(confidences, ground_truth) + calibrated_confidences = calibration_method.get_calibrated_confidences(confidences) + error_post_calibration = metric.compute(calibrated_confidences, ground_truth) + + assert error_post_calibration <= error_pre_calibration diff --git a/tests/kyle/calibration/test_model_calibrator.py b/tests/kyle/calibration/test_model_calibrator.py new file mode 100644 index 0000000..1f6732a --- /dev/null +++ b/tests/kyle/calibration/test_model_calibrator.py @@ -0,0 +1,54 @@ +import pytest +from sklearn import datasets +from sklearn.model_selection import train_test_split +from sklearn.neural_network import MLPClassifier + +from kyle.calibration import ModelCalibrator +from
diff --git a/tests/conftest.py b/tests/conftest.py
new file mode 100644
index 0000000..faa3f53
--- /dev/null
+++ b/tests/conftest.py
@@ -0,0 +1,17 @@
+import numpy as np
+import pytest
+
+from kyle.sampling.fake_clf import DirichletFC
+from kyle.transformations import PowerLawSimplexAut
+
+
+@pytest.fixture(scope="module")
+def uncalibrated_samples():
+    faker = DirichletFC(2, simplex_automorphism=PowerLawSimplexAut(np.array([30, 20])))
+    return faker.get_sample_arrays(1000)
+
+
+@pytest.fixture(scope="module")
+def calibrated_samples():
+    faker = DirichletFC(2)
+    return faker.get_sample_arrays(1000)
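+
+# Note on the fixtures: DirichletFC without a simplex automorphism presumably draws each
+# label from the sampled confidence vector itself and is thus calibrated by construction,
+# while the strong power-law distortion above yields systematically miscalibrated
+# confidences -- this is what the metrics tests below rely on.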
diff --git a/tests/kale/test_FakeClassifier.py b/tests/kale/test_FakeClassifier.py
deleted file mode 100644
index cb5ad45..0000000
--- a/tests/kale/test_FakeClassifier.py
+++ /dev/null
@@ -1,8 +0,0 @@
-from kale.sampling import FakeClassifier
-
-
-def test_FakeClassifier():
-    faker = FakeClassifier(3)
-    ground_truth, class_proba = faker.get_sample()
-    assert ground_truth in [0, 1, 2]
-    assert len(class_proba) == 3
diff --git a/tests/kyle/calibration/calibration_methods/test_calibration_methods.py b/tests/kyle/calibration/calibration_methods/test_calibration_methods.py
new file mode 100644
index 0000000..041d140
--- /dev/null
+++ b/tests/kyle/calibration/calibration_methods/test_calibration_methods.py
@@ -0,0 +1,26 @@
+import pytest
+
+from kyle.calibration.calibration_methods import TemperatureScaling
+from kyle.metrics import ECE
+
+
+@pytest.fixture(scope="module")
+def metric():
+    return ECE()
+
+
+@pytest.fixture(scope="module")
+def calibration_method():
+    return TemperatureScaling()
+
+
+def test_methods_calibrationErrorLessAfterCalibration(
+    metric, uncalibrated_samples, calibration_method
+):
+    ground_truth, confidences = uncalibrated_samples
+    error_pre_calibration = metric.compute(confidences, ground_truth)
+    calibration_method.fit(confidences, ground_truth)
+    calibrated_confidences = calibration_method.get_calibrated_confidences(confidences)
+    error_post_calibration = metric.compute(calibrated_confidences, ground_truth)
+
+    assert error_post_calibration <= error_pre_calibration
diff --git a/tests/kyle/calibration/test_model_calibrator.py b/tests/kyle/calibration/test_model_calibrator.py
new file mode 100644
index 0000000..1f6732a
--- /dev/null
+++ b/tests/kyle/calibration/test_model_calibrator.py
@@ -0,0 +1,54 @@
+import pytest
+from sklearn import datasets
+from sklearn.model_selection import train_test_split
+from sklearn.neural_network import MLPClassifier
+
+from kyle.calibration import ModelCalibrator
+from kyle.metrics import ECE
+from kyle.models import CalibratableModel
+
+
+@pytest.fixture(scope="module")
+def dataset():
+    X, y = datasets.make_classification(
+        n_samples=2000,
+        n_features=20,
+        n_informative=7,
+        n_redundant=10,
+        n_classes=2,
+        random_state=42,
+    )
+    # the held-out split serves as validation/calibration data, so we name it accordingly
+    X_train, X_val, y_train, y_val = train_test_split(
+        X, y, test_size=0.2, random_state=42
+    )
+
+    return X_train, X_val, y_train, y_val
+
+
+@pytest.fixture(scope="module")
+def uncalibrated_model():
+    return MLPClassifier(hidden_layer_sizes=(50, 50, 50))
+
+
+@pytest.fixture(scope="module")
+def calibratable_model(uncalibrated_model):
+    return CalibratableModel(uncalibrated_model)
+
+
+@pytest.fixture(scope="module")
+def calibrator(dataset):
+    X_train, X_val, y_train, y_val = dataset
+    calibrator = ModelCalibrator(X_val, y_val, X_fit=X_train, y_fit=y_train)
+    return calibrator
+
+
+def test_calibrator_integrationTest(calibrator, calibratable_model):
+    calibrator.calibrate(calibratable_model, fit=True)
+    metric = ECE()
+    predicted_probas = calibratable_model.model.predict_proba(calibrator.X_calibrate)
+    calibrated_predicted_probas = calibratable_model.predict_proba(
+        calibrator.X_calibrate
+    )
+    assert metric.compute(
+        calibrated_predicted_probas, calibrator.y_calibrate
+    ) < metric.compute(predicted_probas, calibrator.y_calibrate)
diff --git a/tests/kyle/metrics/test_metrics.py b/tests/kyle/metrics/test_metrics.py
new file mode 100644
index 0000000..0b1a658
--- /dev/null
+++ b/tests/kyle/metrics/test_metrics.py
@@ -0,0 +1,23 @@
+import pytest
+
+from kyle.metrics import ACE, ECE, MCE
+
+
+@pytest.fixture(scope="module")
+def metrics():
+    criteria = [ECE(), MCE(), ACE()]
+    return criteria
+
+
+def test_metrics_calibratedConfidencesHaveZeroError(metrics, calibrated_samples):
+    ground_truth, confidences = calibrated_samples
+    epsilon = 0.1
+    for criterion in metrics:
+        assert criterion.compute(confidences, ground_truth) <= epsilon
+
+
+def test_metrics_uncalibratedConfidencesHaveNonZeroError(metrics, uncalibrated_samples):
+    ground_truth, confidences = uncalibrated_samples
+    epsilon = 0.1
+    for criterion in metrics:
+        assert criterion.compute(confidences, ground_truth) > epsilon
diff --git a/tests/kyle/sampling/test_fake_clf.py b/tests/kyle/sampling/test_fake_clf.py
new file mode 100644
index 0000000..ab0ba86
--- /dev/null
+++ b/tests/kyle/sampling/test_fake_clf.py
@@ -0,0 +1,11 @@
+from kyle.sampling.fake_clf import DirichletFC
+from kyle.util import in_simplex
+
+
+def test_DirichletFC_basics():
+    faker = DirichletFC(3)
+    ground_truth, class_proba = faker.get_sample_arrays(10)
+    assert ground_truth.shape == (10,)
+    assert class_proba.shape == (10, 3)
+    assert ground_truth[0] in [0, 1, 2]
+    assert in_simplex(class_proba)
diff --git a/tests/kyle/test_util.py b/tests/kyle/test_util.py
new file mode 100644
index 0000000..ba0d2dc
--- /dev/null
+++ b/tests/kyle/test_util.py
@@ -0,0 +1,37 @@
+import numpy as np
+
+from kyle.util import in_simplex
+
+
+def test_in_simplex_negativeEntriesForbidden():
+    assert not in_simplex(np.array([0.5, -0.5]))
+
+
+def test_in_simplex_larger1Forbidden():
+    assert not in_simplex(np.array([0, 2]))
+
+
+def test_in_simplex_sumNot1Forbidden():
+    assert not in_simplex(np.array([0.4, 0.7]))
+    assert not in_simplex(np.array([0.1, 0.1]))
+    assert not in_simplex(np.random.default_rng().random((5, 3)))
+
+
+def test_in_simplex_wrongSizeForbidden():
+    assert not in_simplex(np.array([1]), num_classes=2)
+    assert not in_simplex(np.array([1, 0, 0]), num_classes=2)
+    assert not in_simplex(np.random.default_rng().random((5, 3)), num_classes=2)
+
+
+def test_in_simplex_correctInputIsCorrect():
+    assert in_simplex(np.array([0.5, 0.5]), num_classes=2)
+    x = np.random.default_rng().random(5)
+    assert in_simplex(x / x.sum())
+
+
+def test_in_simplex_correct2DInputIsCorrect():
+    x = np.random.default_rng().random((5, 3))
+    row_sums = x.sum(axis=1)
+    x = x / row_sums[:, np.newaxis]
+    assert in_simplex(x)
+    assert in_simplex(x, num_classes=3)
diff --git a/tox.ini b/tox.ini
index 99399ea..11cad40 100644
--- a/tox.ini
+++ b/tox.ini
@@ -1,21 +1,64 @@
-# tox (https://tox.readthedocs.io/) is a tool for running tests in multiple virtualenvs.
-# To use it, "pip install tox" and then run "tox" from this directory.to
-
 [tox]
-envlist = py, docs
+envlist = py, docs, linting, report
 isolated_build = True
 
 [testenv]
-extras =
-    test
+# pytest-cov has an issue when the tests are inside an sdist, as created by tox by default:
+# despite tests being run, coverage discovers no data, leading to "Coverage.py warning: No data was collected".
+# This issue is resolved by running pytest-cov in tox development mode, thus not creating an sdist.
+usedevelop = true
 commands =
+    coverage erase
+    pytest --cov --cov-append --cov-report=term-missing tests
+    pytest -n 4 notebooks
+deps =
     pytest
+    pytest-cov
+    pytest-xdist
+    pytest-lazy-fixture
+    jupyter==1.0.0
+    nbconvert==5.6.1
+    -rrequirements.txt
+
+[testenv:linting]
+skip_install = true
+commands =
+    black --check .
+    isort . --check --diff
+    bash -c \'python build_scripts/run_pylint.py > >(pylint-json2html -f jsonextended -o pylint.html) \'
+deps =
+    pylint
+    anybadge
+    pylint-json2html
+    black
+    isort
+whitelist_externals =
+    bash
 
 [testenv:docs]
-extras =
-    docs
+; NOTE: we don't use pytest for running the doctests, even though pytest would spare us writing explicit imports in them.
+; The reason is that we want to run the doctests during the docs build (which might happen on a remote machine,
+; as on read_the_docs) with possibly fewer external dependencies, using sphinx's ability to automock the missing ones.
 commands =
-    python scripts/update_docs.py
+    python build_scripts/update_docs.py
     git add docs/*
     sphinx-build -W -b html -d "{envtmpdir}/doctrees" docs "docs/_build/html"
     sphinx-build -b doctest -d "{envtmpdir}/doctrees" docs "docs/_build/doctest"
+deps =
+    Sphinx==3.2.1
+    sphinxcontrib-websupport==1.2.4
+    sphinx_rtd_theme
+    nbsphinx
+    ipython
+whitelist_externals =
+    git
+
+[testenv:report]
+commands =
+    coverage html
+    coverage-badge -o badges/coverage.svg -f
+    coverage erase
+deps =
+    coverage
+    coverage-badge
+skip_install = true
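+
+# Example invocations: plain `tox` runs the full envlist above, while
+# `tox -e linting` or `tox -e docs` runs a single environment.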