pymc-devs · williambdean · Nov 15, 2024 · Nov 15, 2024 · Nov 15, 2024 · Nov 16, 2024
diff --git a/environment.yml b/environment.yml
@@ -0,0 +1,24 @@
+name: pymc-examples
+channels:
+- conda-forge
+dependencies:
+- python=3.11
+- pymc
+- pymc-bart
+- nutpie
+# spatial notebooks
+- geopandas
+- folium
+- libpysal
+- rasterio
+# pip itself is listed explicitly because the environment has a `pip:` subsection below
+- pip
+- pip:
+  - pymc-experimental
+  - preliz
+  - bambi
+  - jax
+  # papermill and joblib are imported by scripts/run_notebooks/runner.py
+  - papermill
+  - joblib
+  - jupyter
+  - seaborn
+  - watermark
+  - lifelines
diff --git a/scripts/run_notebooks/injected.py b/scripts/run_notebooks/injected.py
@@ -0,0 +1,74 @@
+"""Injected code to the top of each notebook to mock long running code."""
+
+import os
+import numpy as np
+import pymc as pm
+import xarray as xr
+
+
+def mock_sample(*args, **kwargs):
+    if len(args) > 0:
+        draws = args[0]
+    else:
+        draws = kwargs.get("draws", 1000)
+    random_seed = kwargs.get("random_seed", None)
+    rng = np.random.default_rng(random_seed)
+    model = kwargs.get("model", None)
+    chains = kwargs.get("chains", os.cpu_count())
+    idata = pm.sample_prior_predictive(
+        model=model,
+        random_seed=random_seed,
+        samples=draws,
+    )
+    n_chains = chains
+    expanded_chains = xr.DataArray(
+        np.ones(n_chains),
+        coords={"chain": np.arange(n_chains)},
+    )
+    idata.add_groups(
+        posterior=(idata.prior.mean("chain") * expanded_chains).transpose("chain", "draw", ...)
+    )
+    idata.posterior.attrs["sampling_time"] = 1.0
+
+    if "prior" in idata:
+        del idata.prior
+    if "prior_predictive" in idata:
+        del idata.prior_predictive
+
+    # Create mock sample stats with diverging data
+    if "sample_stats" not in idata:
+        n_chains = chains
+        n_draws = draws
+        sample_stats = xr.Dataset(
+            {
+                "diverging": xr.DataArray(
+                    np.zeros((n_chains, n_draws), dtype=int),
+                    dims=("chain", "draw"),
+                ),
+                "energy": xr.DataArray(
+                    rng.normal(loc=150, scale=2.5, size=(n_chains, n_draws)),
+                    dims=("chain", "draw"),
+                ),
+                "tree_depth": xr.DataArray(
+                    rng.choice([1, 2, 3], p=[0.01, 0.86, 0.13], size=(n_chains, n_draws)),
+                    dims=("chain", "draw"),
+                ),
+                "acceptance_rate": xr.DataArray(
+                    rng.beta(0.5, 0.5, size=(n_chains, n_draws)),
+                    dims=("chain", "draw"),
+                ),
+                # Different sampler
+                "accept": xr.DataArray(
+                    rng.choice([0, 1], size=(n_chains, n_draws)),
+                    dims=("chain", "draw"),
+                ),
+            }
+        )
+        idata.add_groups(sample_stats=sample_stats)
+
+    return idata
+
+
+# Route every ``pm.sample`` call made after this cell through ``mock_sample``.
+pm.sample = mock_sample
+# Alias the flat priors to HalfNormal / Normal.
+# NOTE(review): presumably needed because the mock draws from the prior via
+# ``pm.sample_prior_predictive`` and flat priors cannot be sampled — confirm.
+pm.HalfFlat = pm.HalfNormal
+pm.Flat = pm.Normal
diff --git a/scripts/run_notebooks/runner.py b/scripts/run_notebooks/runner.py
@@ -0,0 +1,219 @@
+"""CLI to notebook or directory of notebooks.
+
+Arguments
+---------
+--notebooks: Specific notebook or directory of notebooks to run.
+--mock: Run notebooks with mock code. Default is True. If --no-mock is provided,
+    notebooks will run without mock code.
+
+Examples
+--------
+Run all notebooks in a directory with mock code:
+
+.. code-block:: bash
+
+    python scripts/run_notebooks/runner.py --notebooks notebooks/ --mock
+
+Run a single notebook without mocked code:
+
+.. code-block:: bash
+
+    python scripts/run_notebooks/runner.py --notebooks notebooks/notebook.ipynb --no-mock
+
+Run all the notebooks in two different directories with mocked code (default):
+
+.. code-block:: bash
+
+    python scripts/run_notebooks/runner.py --notebooks notebooks/ notebooks2/
+
+"""
+
+from argparse import ArgumentParser
+
+from rich.console import Console
+from dataclasses import dataclass
+import logging
+from pathlib import Path
+from tempfile import NamedTemporaryFile
+from typing import TypedDict
+from uuid import uuid4
+
+import papermill
+from joblib import Parallel, delayed
+from nbformat.notebooknode import NotebookNode
+from papermill.iorw import load_notebook_node, write_ipynb
+
+KERNEL_NAME: str = "python3"
+
+HERE = Path(__file__).parent
+INJECTED_CODE_FILE = HERE / "injected.py"
+INJECTED_CODE = INJECTED_CODE_FILE.read_text()
+
+
+def setup_logging() -> None:
+    logging.basicConfig(
+        level=logging.INFO,
+        format="%(asctime)s - %(name)s - %(levelname)s - %(message)s",
+    )
+
+
+def generate_random_id() -> str:
+    return str(uuid4())
+
+
+def inject_pymc_sample_mock_code(cells: list) -> None:
+    cells.insert(
+        0,
+        NotebookNode(
+            id=f"code-injection-{generate_random_id()}",
+            execution_count=sum(map(ord, "Mock pm.sample")),
+            cell_type="code",
+            metadata={"tags": []},
+            outputs=[],
+            source=INJECTED_CODE,
+        ),
+    )
+
+
+def mock_run(notebook_path: Path, i: int, total: int) -> None:
+    nb = load_notebook_node(str(notebook_path))
+    inject_pymc_sample_mock_code(nb.cells)
+    with NamedTemporaryFile(suffix=".ipynb") as f:
+        write_ipynb(nb, f.name)
+        desc = f"({i} / {total}) Mocked {notebook_path.name}"
+        papermill.execute_notebook(
+            input_path=f.name,
+            output_path=None,
+            progress_bar=dict(desc=desc),
+            kernel_name=KERNEL_NAME,
+            cwd=notebook_path.parent,
+        )
+
+
+def actual_run(notebook_path: Path, i: int, total: int) -> None:
+    papermill.execute_notebook(
+        input_path=notebook_path,
+        output_path=None,
+        kernel_name=KERNEL_NAME,
+        progress_bar={"desc": f"({i} / {total}) Running {notebook_path.name}"},
+        cwd=notebook_path.parent,
+    )
+
+
+@dataclass
+class NotebookSuccess:
+    """Result returned by ``run_notebook`` when the run raised no exception."""
+
+    # Path of the notebook that was run.
+    notebook_path: Path
+
+
+@dataclass
+class NotebookFailure:
+    """Result returned by ``run_notebook`` when the run raised an exception."""
+
+    # Path of the notebook that was run.
+    notebook_path: Path
+    # ``str()`` of the exception that was caught.
+    error: str
+
+
+def run_notebook(
+    notebook_path: Path,
+    i: int,
+    total: int,
+    mock: bool = True,
+) -> NotebookFailure | NotebookSuccess:
+    logging.info(f"Running notebook: {notebook_path.name}")
+    run = mock_run if mock else actual_run
+
+    try:
+        run(notebook_path, i=i, total=total)
+    except Exception as e:
+        logging.error(f"{e.__class__.__name__} encountered running notebook: {str(notebook_path)}")
+        return NotebookFailure(notebook_path=notebook_path, error=str(e))
+    else:
+        return NotebookSuccess(notebook_path=notebook_path)
+
+
+class RunParams(TypedDict):
+    """Keyword arguments for a single ``run_notebook`` call."""
+
+    # Notebook to run.
+    notebook_path: Path
+    # Whether to run with the injected mock code.
+    mock: bool
+    # Position of this notebook in the batch (``run_parameters`` counts from 1).
+    i: int
+    # Number of notebooks in the batch.
+    total: int
+
+
+def run_parameters(notebook_paths: list[Path], mock: bool = True) -> list[RunParams]:
+    def to_mock(notebook_path: Path, i: int) -> RunParams:
+        return RunParams(
+            notebook_path=notebook_path,
+            mock=mock,
+            i=i,
+            total=len(notebook_paths),
+        )
+
+    return [to_mock(notebook_path, i=i) for i, notebook_path in enumerate(notebook_paths, start=1)]
+
+
+def main(notebooks_to_run: list[Path], mock: bool = True) -> None:
+    console = Console()
+    setup_logging()
+    logging.info("Starting notebook runner")
+    logging.info(f"Running {len(notebooks_to_run)} notebook(s).")
+    results = Parallel(n_jobs=-1)(
+        delayed(run_notebook)(**run_params)
+        for run_params in run_parameters(notebooks_to_run, mock=mock)
+    )
+    errors: list[NotebookFailure] = list(filter(lambda x: isinstance(x, NotebookFailure), results))
+    successes: list[NotebookSuccess] = list(
+        filter(lambda x: isinstance(x, NotebookSuccess), results)
+    )
+
+    if not errors:
+        logging.info("All notebooks ran successfully!")
+        return
+
+    for error in errors:
+        console.rule(f"[bold red]Error running {error.notebook_path}[/bold red]")
+        console.print(error.error)
+
+    for success in successes:
+        console.print(f"[bold green]Success running {success.notebook_path}[/bold green]")
+
+    logging.error(f"{len(errors)} / {len(notebooks_to_run)} notebooks failed")
+
+
+def parse_args():
+    parser = ArgumentParser()
+    parser.add_argument(
+        "--notebooks",
+        nargs="+",
+        help="List of notebooks to run. If not provided, all notebooks will be run.",
+    )
+    mock_group = parser.add_mutually_exclusive_group()
+    mock_group.add_argument(
+        "--mock",
+        action="store_true",
+        help="Run notebooks with mock code",
+        dest="mock",
+    )
+    mock_group.add_argument(
+        "--no-mock",
+        action="store_false",
+        help="Run notebooks without mock code",
+        dest="mock",
+    )
+    parser.set_defaults(mock=True)
+    args = parser.parse_args()
+
+    notebooks_to_run = []
+    notebooks = args.notebooks
+    notebooks = [Path(notebook) for notebook in notebooks]
+    for notebook in notebooks:
+        if notebook.is_dir():
+            notebooks_to_run.extend(notebook.glob("*.ipynb"))
+            notebooks_to_run.extend(notebook.glob("*/*.ipynb"))
+        else:
+            notebooks_to_run.append(notebook)
+
+    args.notebooks = notebooks_to_run
+
+    return args
+
+
+if __name__ == "__main__":
+    # Script entry point: parse the CLI flags, then run the resolved notebooks.
+    args = parse_args()
+    main(args.notebooks, mock=args.mock)