# TPUtils

Babysit your preemptible TPUs - in Python

## Usage

### Long-running preemptible training

For example, the following code can be used to create a production-ready v3-256 using
the [HomebrewNLP-Jax](https://github.com/HomebrewNLP/HomebrewNLP-Jax) codebase
(see [examples/pod.py](https://github.com/clashluke/tputils/blob/main/examples/pod.py) for an executable version):

```PYTHON
import dataclasses
import typing
from netrc import netrc

import wandb
import yaml

from tputils import exec_command, exec_on_tpu, send_to_tpu, start_single

# Read the WandB API key from ~/.netrc so it never has to be hardcoded.
_, _, wandb_key = netrc().authenticators("api.wandb.ai")


@dataclasses.dataclass
class Context:
    retry: int


ZONE = "europe-west4-a"
HOST = "big-pod"
RUN_NAME = "256-core-tpu"


def load_config(ctx: Context):
    with open("config.yaml", 'r') as f:
        config = yaml.safe_load(f.read())

    wandb_api = wandb.Api()
    config["training"]["do_checkpoint"] = True
    base_checkpoint_path = config["training"]["checkpoint_path"]

    # Find the last logged step of the previous run and round it down to the last written checkpoint.
    start_step = 0
    for run in wandb_api.runs(f"{config['wandb']['entity']}/{config['wandb']['project']}"):
        if run.name == config['wandb']['name']:
            start_step = run.summary["_step"]
            break
    start_step -= start_step % config["training"]["checkpoint_interval"]

    config["training"]["start_step"] = start_step
    config["wandb"]["name"] = f"{RUN_NAME}-{ctx.retry}"
    if ctx.retry > 0:  # resume from the previous checkpoint and write to a fresh path
        config["training"]["checkpoint_load_path"] = config["training"]["checkpoint_path"]
        config["training"]["checkpoint_path"] = f"{base_checkpoint_path}-{ctx.retry}"
    return yaml.dump(config)


def start_fn(ctx: Context, worker: int):
    """
    This function gets executed in threads to start a run on a new TPU. It receives the context object returned by
    `creation_callback` as well as the worker id, which corresponds to the slice this code gets executed on in a
    multi-host setup. For single-host setups, such as v3-8s, the worker will always be 0.
    Ideally, it'd copy the necessary files to the TPU and then run them. Here, `exec_command` can be used to create
    an execution command that automatically spawns a `screen` session which persists even when the SSH connection
    gets cut.
    """
    send_to_tpu(ZONE, HOST, "config.yaml", load_config(ctx), worker)
    cmd = exec_command(repository="https://github.com/HomebrewNLP/HomebrewNLP-Jax", wandb_key=wandb_key)
    send_to_tpu(ZONE, HOST, "setup.sh", cmd, worker)
    exec_on_tpu(ZONE, HOST, "bash setup.sh", worker)


def creation_callback(host: str, ctx: typing.Optional[Context]) -> Context:
    """
    The `creation_callback` is called once whenever a new TPU gets created and can be used to persist state
    (such as retry counters) across multiple invocations.
    """
    if ctx is None:  # first invocation
        return Context(0)
    ctx.retry += 1
    return ctx


def main(service_account: str, tpu_version: int = 3, slices: int = 32, preemptible: bool = True):
    start_single(host=HOST, tpu_version=tpu_version, zone=ZONE, preemptible=preemptible,
                 service_account=service_account, slices=slices, start_fn=start_fn,
                 creation_callback=creation_callback)
```
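
From there, `main` only needs to be called with a service account that is allowed to create TPUs. A minimal sketch of
an entry point, with a placeholder service account:

```PYTHON
if __name__ == "__main__":
    # Placeholder service account; use one with TPU-creation permissions in your project.
    main(service_account="tpu-runner@my-project.iam.gserviceaccount.com")
```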

### Sweeps

Similarly, large swarms of instances can be launched trivially using TPUtils. Here, we largely do the same setup as
above, but call `start_multiple` instead of `start_single`, which takes the additional argument `tpus`, specifying the
number of TPUs that should be launched and babysat. Depending on capacity and quota, the actual number of TPUs you get
might be lower than the number specified.

```PYTHON
def main(service_account: str, tpus: int, tpu_version: int = 3, slices: int = 32, preemptible: bool = True):
    start_multiple(prefix=HOST, tpu_version=tpu_version, zone=ZONE, preemptible=preemptible,
                   service_account=service_account, slices=slices, start_fn=start_fn,
                   creation_callback=creation_callback, tpus=tpus)
```

However, this would simply launch the same run many times. If you instead plan to register the runs with a
[WandB Sweep](https://docs.wandb.ai/guides/sweeps/configuration), you need to modify the `start_fn` to join the wandb
sweep.\
By patching in the code below, TPUtils will start and maintain a large swarm of TPUs, all working on the same
hyperparameter-optimization problem.

```PYTHON
with open("sweep.yaml", 'r') as f:  # sweep config passed straight to wandb
    config = yaml.safe_load(f.read())
sweep_id = wandb.sweep(config, entity="homebrewnlp", project="gpt")


def start_fn(ctx: Context, worker: int):
    # Instead of launching the training script directly, each worker joins the sweep as a wandb agent.
    cmd = exec_command(repository="https://github.com/HomebrewNLP/HomebrewNLP-Jax", wandb_key=wandb_key,
                       run_command=f"/home/ubuntu/.local/bin/wandb agent {sweep_id}")
    send_to_tpu(ZONE, HOST, "setup.sh", cmd, worker)
    exec_on_tpu(ZONE, HOST, "bash setup.sh", worker)
```
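
For reference, `wandb.sweep` also accepts a plain dict in place of the YAML file. A minimal sketch with placeholder
metric and parameter names, not the actual HomebrewNLP-Jax hyperparameters:

```PYTHON
# Hypothetical sweep config; "loss" and "learning_rate" are placeholders for the project's real names.
sweep_config = {"method": "bayes",
                "metric": {"name": "loss", "goal": "minimize"},
                "parameters": {"learning_rate": {"min": 1e-5, "max": 1e-3}}}
sweep_id = wandb.sweep(sweep_config, entity="homebrewnlp", project="gpt")
```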

The full executable code can be found in
[examples/sweep.py](https://github.com/clashluke/tputils/blob/main/examples/sweep.py).

Similarly, the `start_fn` could be adapted to start an inference server for
[HomebrewNLP](https://github.com/HomebrewNLP/HomebrewNLP-Jax/) or
[Craiyon](https://huggingface.co/spaces/dalle-mini/dalle-mini), or even to execute machine-learning unit tests in
parallel, as sketched below.
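
Swapping the `run_command` is enough to turn the swarm into a parallel test runner. A minimal sketch, in which the
test entry point is an assumption about the target repository:

```PYTHON
def start_fn(ctx: Context, worker: int):
    # Hypothetical test entry point; adjust the command to the repository's actual test suite.
    cmd = exec_command(repository="https://github.com/HomebrewNLP/HomebrewNLP-Jax", wandb_key=wandb_key,
                       run_command="python3 -m pytest -x unit_tests.py")
    send_to_tpu(ZONE, HOST, "setup.sh", cmd, worker)
    exec_on_tpu(ZONE, HOST, "bash setup.sh", worker)
```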