diff --git a/tutorials/alcf/02_ThetaGPU_ray/README.rst b/tutorials/alcf/02_ThetaGPU_ray/README.rst index faa051c..24c2508 100644 --- a/tutorials/alcf/02_ThetaGPU_ray/README.rst +++ b/tutorials/alcf/02_ThetaGPU_ray/README.rst @@ -25,7 +25,7 @@ Start by creating a script named ``activate-dhenv.sh`` to initialize your enviro . /etc/profile # Tensorflow optimized for A100 with CUDA 11 - module load conda/2021-11-30 + module load conda/2022-07-01 # Activate conda env conda activate $CONDA_ENV_PATH diff --git a/tutorials/alcf/03_ThetaGPU_mpi/README.rst b/tutorials/alcf/03_ThetaGPU_mpi/README.rst index bac2b1c..e3c3fa8 100644 --- a/tutorials/alcf/03_ThetaGPU_mpi/README.rst +++ b/tutorials/alcf/03_ThetaGPU_mpi/README.rst @@ -25,7 +25,7 @@ Start by creating a script named ``activate-dhenv.sh`` to initialize your enviro . /etc/profile # Tensorflow optimized for A100 with CUDA 11 - module load conda/2021-11-30 + module load conda/2022-07-01 # Activate conda env conda activate $CONDA_ENV_PATH diff --git a/tutorials/alcf/04_ThetaGPU_jupyter/README.rst b/tutorials/alcf/04_ThetaGPU_jupyter/README.rst index 2d72a2c..94486a4 100644 --- a/tutorials/alcf/04_ThetaGPU_jupyter/README.rst +++ b/tutorials/alcf/04_ThetaGPU_jupyter/README.rst @@ -21,7 +21,7 @@ After logging in Theta: .. code-block:: console $ . /etc/profile - $ module load conda/2021-09-22 + $ module load conda/2022-07-01 $ conda activate $CONDA_ENV_PATH 3. Then, start the Jupyter notebook server: diff --git a/tutorials/notebooks/09_HPS_CORDS/tutorial_09.ipynb b/tutorials/notebooks/09_HPS_CORDS/tutorial_09.ipynb new file mode 100644 index 0000000..bacea2a --- /dev/null +++ b/tutorials/notebooks/09_HPS_CORDS/tutorial_09.ipynb @@ -0,0 +1,490 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "d9d7beb5-0894-4264-a3d6-a3790e9673ca", + "metadata": {}, + "source": [ + "In this tutorial we will cover the use of subset selection algorithms. Subset selection offers a more efficient way of training models while minimizing accuracy loss by selecting an optimal subset out of a given batch. This tutorial uses CORDS, a library that offers subset selection. It is important to note that the library only supports PyTorch, and requires the use of its own dataloaders for each corresponding strategy. Now we will download the requisite packages:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "348bdc82-9cbd-405e-b551-abc371f192c2", + "metadata": {}, + "outputs": [], + "source": [ + "!pip install deephyper\n", + "!pip install cords" + ] + }, + { + "cell_type": "markdown", + "id": "4fb57d5b-418e-43c0-aaaf-b11bf667718f", + "metadata": {}, + "source": [ + "Note:\n", + "As of the writing of this tutorial, CORDS is suggested for CPU-only use. If GPU use is needed, such as on ThetaGPU, one could edit the module files so that the torchtext module is removed. torchtext conflicts with PyTorch 1.12+ because CORDS requires torchtext 0.10.0, while PyTorch 1.12+ is needed to use CUDA 11.3."
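+ , + "\n", + "As a quick, hedged check of whether this conflict affects your environment (assuming both packages are importable), one can print the installed versions:\n", + "\n", + "```python\n", + "import torch\n", + "import torchtext  # may be absent, which is fine for CPU-only use\n", + "print(torch.__version__, torchtext.__version__)\n", + "```"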
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "668d932f-70c4-41f4-8291-da6fe6b66810", + "metadata": {}, + "outputs": [], + "source": [ + "import torch\n", + "import torch.nn as nn\n", + "import torch.nn.functional as f\n", + "import torch.optim as o\n", + "from torch.utils.data import DataLoader\n", + "from torch.utils.data.dataset import random_split\n", + "\n", + "import cords\n", + "from cords.utils.data.datasets.SL import gen_dataset\n", + "from cords.utils.data.dataloader.SL.adaptive import GradMatchDataLoader, RandomDataLoader\n", + "from dotmap import DotMap\n", + "\n", + "from deephyper.problem import HpProblem\n", + "from deephyper.evaluator import Evaluator\n", + "from deephyper.search.hps import CBO\n", + "\n", + "import os\n", + "import os.path as osp\n", + "import sys" + ] + }, + { + "cell_type": "markdown", + "id": "5d6cdcfc-2b0f-456c-8fb5-dfedba71400c", + "metadata": {}, + "source": [ + "**Dataset**\n", + "\n", + "The dataset will be CIFAR10, an image database of 10 categories of cats, dogs, planes, etc. We can utilize one of CORDS' generated datasets to get the training, validation and test sets. For those who want to load their data manually, use:\n", + "\n", + "> import torchvision\n", + "> \n", + "> from torchvision.datasets import CIFAR10" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "5de409fa-1f8f-42c1-a3c8-3c4a72b9b8e2", + "metadata": {}, + "outputs": [], + "source": [ + "def load_data():\n", + " train_ds, valid_ds, test_ds, num_cls = gen_dataset('/lus/grand/projects/datascience/ianwixom/expcifar/', 'cifar10', None, isnumpy=False)\n", + " \n", + " return train_ds, valid_ds" + ] + }, + { + "cell_type": "markdown", + "id": "e502fa35-69f5-45f6-a24a-da06f0a3f4ae", + "metadata": {}, + "source": [ + "**CORDS Prerequisites**\n", + "\n", + "CORDS requires the use of a logger, so one could configure the logger as follows:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "37334c08-fab7-4aba-8cd1-085ad8d505ef", + "metadata": {}, + "outputs": [], + "source": [ + "import logging\n", + "\n", + "def __get_logger(results_dir):\n", + " os.makedirs(results_dir, exist_ok=True)\n", + " plain_formatter = logging.Formatter(\"[%(asctime)s] %(name)s %(levelname)s: %(message)s\",\n", + " datefmt=\"%m/%d %H:%M:%S\")\n", + " logger = logging.getLogger(__name__)\n", + " logger.setLevel(logging.INFO)\n", + " s_handler = logging.StreamHandler(stream=sys.stdout)\n", + " s_handler.setFormatter(plain_formatter)\n", + " s_handler.setLevel(logging.INFO)\n", + " logger.addHandler(s_handler)\n", + " f_handler = logging.FileHandler(os.path.join(results_dir, \"results.log\")) # Creates results.log if it does not already exist\n", + " f_handler.setFormatter(plain_formatter)\n", + " f_handler.setLevel(logging.DEBUG)\n", + " logger.addHandler(f_handler)\n", + " logger.propagate = False\n", + " return logger\n", + "\n", + "results_dir = osp.abspath(osp.expanduser('results'))\n", + "logger = __get_logger(results_dir)" + ] + }, + { + "cell_type": "markdown", + "id": "cbfcd791-4682-4fd8-8a17-526af1e5a84b", + "metadata": {}, + "source": [ + "**CORDS DataLoaders**\n", + "\n", + "In order to use the subset selection algorithms provided by CORDS, their specific DataLoader must be used as well. The code below provides two examples of DataLoaders for the Grad-Match and Random strategies respectively. The variables in `dss_args` mean the following:\n", + " - `eta`: learning rate\n", + " - `kappa`: used for warm-starting. 
Determines the number of epochs that use the full dataset.\n", + " - `num_epochs`: total number of epochs.\n", + " - `select_every`: determines the frequency of re-evaluating the subsets.\n", + " - `selection_type`: the way that the subsets are determined.\n", + " - `valid`: use validation data for subset evaluation.\n", + " - `v1`: whether or not to use the new OMP wrapper.\n", + " - `lam`: regularization coefficient.\n", + " - `eps`: the tolerance level of the algorithm's convergence.\n", + " \n", + "More information concerning different selection strategies and other variables may be found at https://github.com/decile-team/cords." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "463469f3-7178-4004-9bbd-81e007864b53", + "metadata": {}, + "outputs": [], + "source": [ + "def SubsetDL(model, criterion, optimizer, dhargs):\n", + " dss_args = dict(model = model,\n", + " loss = criterion,\n", + " eta = dhargs['lr'],\n", + " num_classes = 10,\n", + " device = 'cpu',\n", + " fraction = 0.1,\n", + " kappa = 0,\n", + " # num_epochs = 20,\n", + " select_every = 5,\n", + " linear_layer = True,\n", + " selection_type = 'PerBatch',\n", + " valid = False,\n", + " v1 = True,\n", + " lam = 1,\n", + " eps = 0.1)\n", + " dss_args = DotMap(dss_args)\n", + " \n", + " return GradMatchDataLoader(dhargs['train_d'], dhargs['valid_d'], dss_args, \n", + " logger, batch_size=dhargs['batch'], \n", + " shuffle=True, pin_memory=True)\n", + "\n", + "def RandomDL(dhargs):\n", + " dss_args = dict(select_every = 10, \n", + " kappa = 0,\n", + " fraction = 0.1,\n", + " device = 'cuda')\n", + "\n", + " dss_args = DotMap(dss_args)\n", + " return RandomDataLoader(dhargs['train_d'], dss_args, logger, \n", + " shuffle=True, pin_memory=False)" + ] + }, + { + "cell_type": "markdown", + "id": "d2c3d19f-bb6c-405d-bee0-0d0779b9b4a9", + "metadata": {}, + "source": [ + "**Training and Validation**\n", + "\n", + "Training with CORDS is similar to other PyTorch model training scripts, but one key change is to the loss function:\n", + " - the CORDS dataloader yields three values per batch: features, labels and subset weights\n", + " - `loss = torch.dot(criterion(predictions, labels), weights / weights.sum())`\n", + " \n", + "Little to no change is needed for validation."
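+ , + "\n", + "As a minimal illustration (not part of the tutorial pipeline): with uniform weights, the weighted loss above reduces to the usual mean cross-entropy, which is why the criterion must use `reduction='none'` so that per-sample losses are available for weighting:\n", + "\n", + "```python\n", + "import torch\n", + "import torch.nn as nn\n", + "\n", + "criterion = nn.CrossEntropyLoss(reduction='none')  # per-sample losses\n", + "logits, labels = torch.randn(4, 10), torch.randint(0, 10, (4,))\n", + "weights = torch.ones(4)  # uniform subset weights\n", + "weighted = torch.dot(criterion(logits, labels), weights / weights.sum())\n", + "assert torch.isclose(weighted, criterion(logits, labels).mean())\n", + "```"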
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "d4258e42-0b34-4819-8d78-ad3fa1456ba8", + "metadata": {}, + "outputs": [], + "source": [ + "def train(model, criterion, optimizer, scheduler, epochs, dl, valid_dl):\n", + " acc_max = 0\n", + " for i in range(epochs):\n", + " model.train()\n", + " for _, (features, labels, weights) in enumerate(dl):\n", + " features, labels, weights = features.to(device), labels.to(device, non_blocking = True), weights.to(device)\n", + " \n", + " optimizer.zero_grad()\n", + " predictions = model(features)\n", + " loss = torch.dot(criterion(predictions, labels), weights / weights.sum())\n", + " loss.backward()\n", + " optimizer.step()\n", + " \n", + " acc = valid(model, criterion, optimizer, valid_dl)\n", + " print(f\"The accuracy of the model on epoch {i} is {acc*100:.1f}%\")\n", + "\n", + " if acc_max < acc:\n", + " acc_max = acc\n", + " scheduler.step()\n", + " \n", + " return acc_max\n", + "\n", + "def valid(model, criterion, optimizer, dl):\n", + " model.eval()\n", + " val_loss = 0\n", + " correct = 0\n", + " with torch.no_grad():\n", + " for _, (features, labels) in enumerate(dl):\n", + " features, labels = features.to(device), labels.to(device, non_blocking = True)\n", + " predictions = model(features)\n", + " loss = criterion(predictions, labels) # per-sample losses; kept only if one wants to track validation loss\n", + " correct += (predictions.argmax(1) == labels).type(torch.float).sum().item()\n", + " return correct / len(dl.dataset)" + ] + }, + { + "cell_type": "markdown", + "id": "d976ab9f-0508-4864-b2d3-33ed15fec455", + "metadata": {}, + "source": [ + "**Residual Network Model**\n", + "\n", + "Residual networks are deep learning models that utilize residual (skip) connections from previous layers to improve accuracy and performance. The code below was written by CORDS developer Krishnateja Killamsetty, adapted from the \"Deep Residual Learning for Image Recognition\" paper. 
" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "976a89a6-eb41-4a5e-8f51-ae7aa0ca98e0", + "metadata": { + "jupyter": { + "source_hidden": true + }, + "tags": [] + }, + "outputs": [], + "source": [ + "class BasicBlock(nn.Module):\n", + " expansion = 1\n", + "\n", + " def __init__(self, in_planes, planes, stride=1):\n", + " super(BasicBlock, self).__init__()\n", + " self.conv1 = nn.Conv2d(in_planes, planes, kernel_size=3, stride=stride, padding=1, bias=False)\n", + " self.bn1 = nn.BatchNorm2d(planes)\n", + " self.conv2 = nn.Conv2d(planes, planes, kernel_size=3, stride=1, padding=1, bias=False)\n", + " self.bn2 = nn.BatchNorm2d(planes)\n", + "\n", + " self.shortcut = nn.Sequential()\n", + " if stride != 1 or in_planes != self.expansion*planes:\n", + " self.shortcut = nn.Sequential(\n", + " nn.Conv2d(in_planes, self.expansion*planes, kernel_size=1, stride=stride, bias=False),\n", + " nn.BatchNorm2d(self.expansion*planes)\n", + " )\n", + "\n", + " def forward(self, x):\n", + " out = f.relu(self.bn1(self.conv1(x)))\n", + " out = self.bn2(self.conv2(out))\n", + " out += self.shortcut(x)\n", + " out = f.relu(out)\n", + " return out\n", + "\n", + "class ResNet(nn.Module):\n", + " def __init__(self, block, num_blocks, num_classes=10):\n", + " super(ResNet, self).__init__()\n", + " self.in_planes = 64\n", + " self.embDim = 8 * self.in_planes * block.expansion\n", + " \n", + " self.conv1 = nn.Conv2d(3, 64, kernel_size=3, stride=1, padding=1, bias=False)\n", + " self.bn1 = nn.BatchNorm2d(64)\n", + " self.layer1 = self._make_layer(block, 64, num_blocks[0], stride=1)\n", + " self.layer2 = self._make_layer(block, 128, num_blocks[1], stride=2)\n", + " self.layer3 = self._make_layer(block, 256, num_blocks[2], stride=2)\n", + " self.layer4 = self._make_layer(block, 512, num_blocks[3], stride=2)\n", + " self.linear = nn.Linear(512*block.expansion, num_classes)\n", + "\n", + "\n", + " def _make_layer(self, block, planes, num_blocks, stride):\n", + " strides = [stride] + [1]*(num_blocks-1)\n", + " layers = []\n", + " for stride in strides:\n", + " layers.append(block(self.in_planes, planes, stride))\n", + " self.in_planes = planes * block.expansion\n", + " return nn.Sequential(*layers)\n", + "\n", + " def forward(self, x, last=False, freeze=False):\n", + " if freeze:\n", + " with torch.no_grad():\n", + " out = f.relu(self.bn1(self.conv1(x)))\n", + " out = self.layer1(out)\n", + " out = self.layer2(out)\n", + " out = self.layer3(out)\n", + " out = self.layer4(out)\n", + " out = f.avg_pool2d(out, 4)\n", + " e = out.view(out.size(0), -1)\n", + " else:\n", + " out = f.relu(self.bn1(self.conv1(x)))\n", + " out = self.layer1(out)\n", + " out = self.layer2(out)\n", + " out = self.layer3(out)\n", + " out = self.layer4(out)\n", + " out = f.avg_pool2d(out, 4)\n", + " e = out.view(out.size(0), -1)\n", + " out = self.linear(e)\n", + " if last:\n", + " return out, e\n", + " else:\n", + " return out\n", + "\n", + " def get_embedding_dim(self):\n", + " return self.embDim" + ] + }, + { + "cell_type": "markdown", + "id": "9222dddc-1634-4f81-be1e-f90ed56b4334", + "metadata": {}, + "source": [ + "**Defining the HpProblem**\n", + "\n", + "DeepHyper allows the use of ConfigSpace as a way of defining our hyperparameters. 
Here, we define our learning rate hyperparameter space with a normal distribution, setting lower and upper limits to avoid errors:\n", + "\n", + "`lr_normal = csh.NormalFloatHyperparameter('lr', mu=1e-03, sigma=1e-01, log=False,\n", + " lower = 1e-05, upper = 5e-01, default_value = 1e-03)`\n", + " \n", + "After defining the space, one may simply add it to the HpProblem:\n", + "\n", + "`prob.add_hyperparameter(lr_normal)`" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "b7126167-0c94-4ee4-8f71-834ccaf100a0", + "metadata": {}, + "outputs": [], + "source": [ + "import ConfigSpace.hyperparameters as csh\n", + "\n", + "prob = HpProblem()\n", + "\n", + "lr_normal = csh.NormalFloatHyperparameter('lr', mu=1e-03, sigma=1e-01, log=False,\n", + " lower = 1e-05, upper = 5e-01, default_value = 1e-03)\n", + "\n", + "optdict = {\"sgd\": o.SGD, \"rmsprop\": o.RMSprop}\n", + "optimizers = [\"rmsprop\", \"sgd\"]\n", + "\n", + "prob.add_hyperparameter(optimizers, \"optimizers\", default_value = \"sgd\")\n", + "prob.add_hyperparameter((1,50), \"t_max\", default_value = 10)\n", + "prob.add_hyperparameter((0.1,0.95), \"momentum\", default_value = 0.9)\n", + "prob.add_hyperparameter((1e-5,1e-3, 'log-uniform'), \"weightdecay\", default_value = 5e-4)\n", + "prob.add_hyperparameter(lr_normal)" + ] + }, + { + "cell_type": "markdown", + "id": "8bfa07da-063b-4270-9d0c-92d2b5f372f8", + "metadata": {}, + "source": [ + "**Defining the Run Function**\n", + "\n", + "The run function within this tutorial is very similar to other DeepHyper tutorials. Within `ResNet`, there are three inputs: block type, block structure and the number of classes. In this tutorial we do not include the Bottleneck structure seen in deeper models like ResNet-50. More information can be found at https://arxiv.org/pdf/1512.03385.pdf.\n", + "\n", + "In order to use the weights from the CORDS dataloader, the reduction of the criterion must be set to 'none'." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "b5873da3-c186-4513-a162-07386bae20cc", + "metadata": {}, + "outputs": [], + "source": [ + "device = torch.device(\"cpu\")\n", + "\n", + "def run(config: dict):\n", + " acc = 0\n", + " batch = 20\n", + " epochs = 20 # number of training epochs per evaluation; set here since it is not defined elsewhere in this notebook\n", + " train_ds, valid_ds = load_data()\n", + " \n", + " train_dl = DataLoader(train_ds, batch_size = batch, shuffle = True, num_workers = 0, pin_memory = False)\n", + " valid_dl = DataLoader(valid_ds, batch_size = batch, shuffle = True, num_workers = 0, pin_memory = False)\n", + " \n", + " dhargs = {'train_d': train_dl, 'valid_d': valid_dl, 'lr': config['lr'], 'batch': batch}\n", + " block_struct = [2, 2, 2, 2]\n", + " model = ResNet(BasicBlock, block_struct, 10).to(device)\n", + " \n", + " criterion = nn.CrossEntropyLoss(reduction = 'none')\n", + " optimizer = optdict[config[\"optimizers\"]](model.parameters(), lr = config[\"lr\"])\n", + " scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(optimizer, T_max=config[\"t_max\"])\n", + " \n", + " subset = SubsetDL(model, criterion, optimizer, dhargs)\n", + " acc = train(model, criterion, optimizer, scheduler, epochs, subset, valid_dl)\n", + " \n", + " return acc" + ] + }, + { + "cell_type": "markdown", + "id": "bb412480-41ed-435b-b25b-f8d78a160678", + "metadata": {}, + "source": [ + "**Running the Search**\n", + "\n", + "Now that we have defined our data, training and validation, model, run function and hyperparameter space, we can conduct a search." 
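+ , + "\n", + "Before launching, it can be helpful to sanity-check the problem definition; for example, the default configuration used as a starting point below can be printed (a quick check, assuming the cells above were run):\n", + "\n", + "```python\n", + "print(prob.default_configuration)\n", + "```"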
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "2d24d80c-7b33-4254-ba0b-7fb6999de0ce", + "metadata": {}, + "outputs": [], + "source": [ + "from common import NUM_WORKERS\n", + "\n", + "search_log_dir = \"search_log/\"\n", + "os.makedirs(search_log_dir, exist_ok=True)\n", + "\n", + "with Evaluator.create(\n", + " run,\n", + " method=\"process\",\n", + " method_kwargs=dict(\n", + " num_workers=NUM_WORKERS),\n", + " ) as evaluator:\n", + " if evaluator is not None:\n", + " print(f\"Creation of the Evaluator done with {evaluator.num_workers} worker(s)\")\n", + "\n", + " # Search creation\n", + " search = CBO(prob, \n", + " evaluator, \n", + " initial_points=[prob.default_configuration], \n", + " random_state = 3)\n", + "\n", + " # Search execution\n", + " print(\"Starting the search...\")\n", + " prelim_result = search.search(max_evals = -1, timeout = 2700)\n", + " print(\"Search is done\")\n", + "\n", + " prelim_result.to_csv(os.path.join(search_log_dir, \"results.csv\"))\n", + " i_max = prelim_result.objective.argmax()\n", + "\n", + " print(f\"\\nThe default configuration has an accuracy of {prelim_result['objective'].iloc[0]:.3f}. \\n\" \\\n", + " f\"The best configuration found by DeepHyper has an accuracy of {prelim_result['objective'].iloc[i_max]:.3f}, \\n\" \\\n", + " f\"finished after {prelim_result['timestamp_gather'].iloc[i_max]-prelim_result['timestamp_submit'].iloc[i_max]:.2f} seconds of search.\\n\")\n" + ] + }, + { + "cell_type": "markdown", + "id": "65cfd7de-e0e7-407e-9e6e-c08a0e8fa42e", + "metadata": {}, + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.8.13" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/tutorials/scripts/01_Tuning_of_MPI_programs/README.rst b/tutorials/scripts/01_Tuning_of_MPI_programs/README.rst index bc6801d..4716cf4 100644 --- a/tutorials/scripts/01_Tuning_of_MPI_programs/README.rst +++ b/tutorials/scripts/01_Tuning_of_MPI_programs/README.rst @@ -5,7 +5,7 @@ Tuning of MPI Programs This tutorial demonstrates DeepHyper's ability to optimize hyperparameters of MPI programs. As a demonstration example, we write a simple MPI C++ code and compile it to obtain a binary. When executed, the binary initializes MPI, prints some information, and computes a polynomial with a parameter obtained through the command line. In DeepHyper we will optimize this binary as a black-box function which obtains some parameters through the command line (or an input file) and produces a result of execution. -This demonstration emulates a situation when a user has a binary that does some computations, and hyperparameters of these computations have to be optimized. In general, one can split binary execution into three stages. In the first, initialization stage, all necessary input files are prepared and a logging directory is created. In the second stage, an MPI program is submitted for execution. And, in the third, finalization stage, output files or artifacts are saved and analyzed, target value obtained from them is returned to DeepHyper. +This demonstration emulates a situation in which a user has a binary that does some computations, and the hyperparameters of these computations have to be optimized. In general, one can split binary execution into three stages.
In the initialization stage, all necessary input files are prepared and a logging directory is created. In the second stage, an MPI program is submitted for execution. And, in the finalization stage, output files or artifacts are saved and analyzed, and the target value obtained from them is returned to DeepHyper. The tutorial requires the installation of ``parse``: @@ -44,7 +44,7 @@ As DeepHyper accepts a Python function as a target function for optimization one In ``run_mpi()`` we obtain the absolute path of the MPI binary, execute it with the ``subprocess`` module, and parse the captured output. When submitting for execution with ``subprocess.run`` we use ``mpirun`` as an executor, specify the necessary environment variables and hosts with slots, and add the binary ``exe`` with an argument obtained through the ``config`` dictionary. The result of the ``exe`` binary execution, obtained through parsing, is returned from ``run_mpi()``. -More powerfull wrapper with initialization and finalization +More powerful wrapper with initialization and finalization ----------------------------------------------------------- Despite being able to run a simple MPI binary, this wrapper has several limitations. The simple MPI binary we compiled above obtains hyperparameters through a command-line interface, but in general a binary may require input files. Therefore, it is important to do some initialization before submitting the binary for execution. Another drawback is the absence of finalization. In this demonstration, we create a context manager ``Experiment`` for these purposes. In the initialization phase, ``Experiment`` creates a directory for the run and changes the current path to it; upon finalization, it changes the path back. In the created folder we create two files for the ``stdout`` and ``stderr`` produced by the binary ``exe``. The running command is also saved. @@ -81,4 +81,4 @@ Optimization with multiple nodes Finally, we demonstrate the execution of a binary across several ranks. For ThetaGPU we obtain a list of nodes in ``get_thetagpu_nodelist()`` and specify that every node has 8 slots. The evaluator is decorated with the ``queue`` decorator, which manages a queue of resources (ThetaGPU nodes in this example) and provides ``queue_pop_per_task`` nodes for one evaluation. In this example ``queue_pop_per_task=2``, such that every evaluation obtains two nodes with eight slots each, resulting in 16 ranks. The number of workers may be computed by dividing the total number of nodes by the number of nodes per evaluation. .. literalinclude:: search3.py - :language: python \ No newline at end of file + :language: python diff --git a/tutorials/scripts/04_CORDS_Training/README.rst b/tutorials/scripts/04_CORDS_Training/README.rst new file mode 100644 index 0000000..b57ef91 --- /dev/null +++ b/tutorials/scripts/04_CORDS_Training/README.rst @@ -0,0 +1,326 @@ +Hyperparameter Search with Subset Selection for Image Classification +==================================================================== + +**Author:** Ian Wixom + +In this tutorial we will cover the use of subset selection algorithms. Subset selection offers a more efficient way of training models while minimizing accuracy loss by selecting an optimal subset out of a given batch. In particular, it is very useful for hyperparameter searches. This tutorial uses CORDS, a library that offers subset selection. It is important to note that the library only supports PyTorch, and requires the use of its own dataloaders for each corresponding strategy. Now we will download the required package: + +.. 
code-block:: console + + $ pip install cords + +.. note:: + + As of the writing of this tutorial, CORDS is suggested for CPU-only use. If GPU use is needed, such as on ThetaGPU, one could edit the module files so that the torchtext module is removed. torchtext conflicts with PyTorch 1.12+ because CORDS requires torchtext 0.10.0, while PyTorch 1.12+ is needed to use CUDA 11.3. + +We will use the following imports to develop and train our model: + +.. code-block:: python + + import torch + import torch.nn as nn + import torch.nn.functional as f + import torch.optim as o + from torch.utils.data import DataLoader + + import cords + from cords.utils.data.datasets.SL import gen_dataset + from cords.utils.data.dataloader.SL.adaptive import GradMatchDataLoader, RandomDataLoader, OLRandomDataLoader + from dotmap import DotMap + + import deephyper as dh + from deephyper.problem import HpProblem + from deephyper.evaluator import Evaluator, profile + from deephyper.search.hps import CBO + from deephyper.evaluator.callback import TqdmCallback + + import pathlib + import os + import os.path as osp + import logging + import sys + +The dataset will be CIFAR10, an image database of 10 categories of cats, dogs, planes, etc. We can utilize one of their generated datasets to get the training, validation and test sets. For those who want to load their data manually, use: + +.. code-block:: python + + import torchvision + from torchvision.datasets import CIFAR10 + +We can use CORDS' dataset generator to simplify preprocessing. Make sure to adapt the output path of the downloaded dataset ``/lus/grand/projects/datascience/ianwixom/expcifar/``: + +.. code-block:: python + + def load_data(): + train_ds, valid_ds, test_ds, num_cls = gen_dataset('/lus/grand/projects/datascience/ianwixom/expcifar/', 'cifar10', None, isnumpy=False) + + return train_ds, valid_ds + +**CORDS Prerequisites** + +CORDS requires the use of a logger, so one could configure the logger as follows, based on their tutorials: + +.. code-block:: python + + def __get_logger(results_dir): + os.makedirs(results_dir, exist_ok=True) + plain_formatter = logging.Formatter("[%(asctime)s] %(name)s %(levelname)s: %(message)s", + datefmt="%m/%d %H:%M:%S") + logger = logging.getLogger(__name__) + logger.setLevel(logging.INFO) + s_handler = logging.StreamHandler(stream=sys.stdout) + s_handler.setFormatter(plain_formatter) + s_handler.setLevel(logging.INFO) + logger.addHandler(s_handler) + f_handler = logging.FileHandler(os.path.join(results_dir, "results.log")) + f_handler.setFormatter(plain_formatter) + f_handler.setLevel(logging.DEBUG) + logger.addHandler(f_handler) + logger.propagate = False + return logger + + results_dir = osp.abspath(osp.expanduser('results')) + logger = __get_logger(results_dir) + +**CORDS DataLoaders** + +In order to use the subset selection algorithms provided by CORDS, their specific DataLoader must be used as well. The code below provides two examples of DataLoaders for the Grad-Match and Random strategies respectively. The variables in ``dss_args`` mean the following: + +- ``eta``: learning rate +- ``kappa``: used for warm-starting. Determines the number of epochs that use the full dataset. +- ``num_epochs``: total number of epochs. +- ``select_every``: determines the frequency of re-evaluating the subsets. +- ``selection_type``: the way that the subsets are determined. +- ``valid``: use validation data for subset evaluation. +- ``v1``: whether or not to use the new OMP wrapper. +- ``lam``: regularization coefficient. 
+- ``eps``: the tolerance level of the algorithm's convergence. + +More information concerning different selection strategies and other variables may be found at https://github.com/decile-team/cords. The Random strategy was included as another potential example when using a GPU. + +.. code-block:: python + + def RandomDL(dhargs): + dss_args = dict(select_every = 10, + kappa = 0, + fraction = 0.125, + device = 'cuda') + + dss_args = DotMap(dss_args) + return RandomDataLoader(dhargs['train_d'], dss_args, logger, + batch_size=dhargs['batch'], shuffle=True, pin_memory=True) + + def SubsetDL(model, criterion, dhargs): + dss_args = dict(model = model, + loss = criterion, + eta = dhargs['lr'], + num_classes = 10, + device = 'cpu', + fraction = 0.125, + kappa = 0, + select_every = 10, + linear_layer = False, + selection_type = 'PerBatch', + valid = False, + v1 = True, + lam = dhargs['reg_coeff'], + eps = 0.1) + + dss_args = DotMap(dss_args) + + return GradMatchDataLoader(dhargs['train_d'], dhargs['valid_d'], dss_args, + logger, batch_size=dhargs['batch'], + shuffle=True, pin_memory=True) + +**Training and Validation** + +Training with CORDS is similar to other PyTorch model training scripts, but one key change is to the loss function: + +- the CORDS dataloader yields three values per batch: features, labels and subset weights +- ``loss = torch.dot(criterion(predictions, labels), weights / weights.sum())`` + +Since the weights are used to update the subset, they must be included in any loop or loss function calculation. Little to no change is needed for validation compared to previous tutorials. + +.. code-block:: python + + def train(model, criterion, optimizer, scheduler, epochs, dl, valid_dl): + acc_max = 0 + for i in range(epochs): + model.train() + for _, (features, labels, weights) in enumerate(dl): + features, labels, weights = features.to(device), labels.to(device, non_blocking = True), weights.to(device) + + optimizer.zero_grad() + predictions = model(features) + loss = torch.dot(criterion(predictions, labels), weights / weights.sum()) + loss.backward() + optimizer.step() + + acc = valid(model, optimizer, valid_dl) + print(f"The accuracy of the model on epoch {i} is {acc*100:.1f}%") + + if acc_max < acc: + acc_max = acc + + return acc_max + + def valid(model, optimizer, dl): + model.eval() + correct = 0 + with torch.no_grad(): + for _, (features, labels) in enumerate(dl): + features, labels = features.to(device), labels.to(device, non_blocking = True) + predictions = model(features) + correct += (predictions.argmax(1) == labels).type(torch.float).sum().item() + return correct / len(dl.dataset) + +**Residual Network Model** + +Residual networks are deep learning models that utilize residual (skip) connections from previous layers to improve accuracy and performance. The code below, with some slight modifications, was written by CORDS developer Krishnateja Killamsetty, adapted from the "Deep Residual Learning for Image Recognition" paper. + +.. 
code-block:: python + + class BasicBlock(nn.Module): + expansion = 1 + + def __init__(self, in_planes, planes, stride=1): + super(BasicBlock, self).__init__() + self.conv1 = nn.Conv2d(in_planes, planes, kernel_size=3, stride=stride, padding=1, bias=False) + self.bn1 = nn.BatchNorm2d(planes) + self.conv2 = nn.Conv2d(planes, planes, kernel_size=3, stride=1, padding=1, bias=False) + self.bn2 = nn.BatchNorm2d(planes) + + self.shortcut = nn.Sequential() + if stride != 1 or in_planes != self.expansion*planes: + self.shortcut = nn.Sequential( + nn.Conv2d(in_planes, self.expansion*planes, kernel_size=1, stride=stride, bias=False), + nn.BatchNorm2d(self.expansion*planes) + ) + + def forward(self, x): + out = f.relu(self.bn1(self.conv1(x))) + out = self.bn2(self.conv2(out)) + out += self.shortcut(x) + out = f.relu(out) + return out + + class ResNet(nn.Module): + def __init__(self, block, num_blocks, num_classes=10): + super(ResNet, self).__init__() + self.in_planes = 64 + self.embDim = 8 * self.in_planes * block.expansion + + self.conv1 = nn.Conv2d(3, 64, kernel_size=3, stride=1, padding=1, bias=False) + self.bn1 = nn.BatchNorm2d(64) + self.layer1 = self._make_layer(block, 64, num_blocks[0], stride=1) + self.layer2 = self._make_layer(block, 128, num_blocks[1], stride=2) + self.layer3 = self._make_layer(block, 256, num_blocks[2], stride=2) + self.layer4 = self._make_layer(block, 512, num_blocks[3], stride=2) + self.linear = nn.Linear(512*block.expansion, num_classes) + + + def _make_layer(self, block, planes, num_blocks, stride): + strides = [stride] + [1]*(num_blocks-1) + layers = [] + for stride in strides: + layers.append(block(self.in_planes, planes, stride)) + self.in_planes = planes * block.expansion + return nn.Sequential(*layers) + + def forward(self, x, last=False, freeze=False): + if freeze: + with torch.no_grad(): + out = f.relu(self.bn1(self.conv1(x))) + out = self.layer1(out) + out = self.layer2(out) + out = self.layer3(out) + out = self.layer4(out) + out = f.avg_pool2d(out, 4) + e = out.view(out.size(0), -1) + else: + out = f.relu(self.bn1(self.conv1(x))) + out = self.layer1(out) + out = self.layer2(out) + out = self.layer3(out) + out = self.layer4(out) + out = f.avg_pool2d(out, 4) + e = out.view(out.size(0), -1) + out = self.linear(e) + if last: + return out, e + else: + return out + + def get_embedding_dim(self): + return self.embDim + +**Defining the Run Function** + +The run function within this tutorial is very similar to other DeepHyper tutorials. Within ``ResNet``, there are three inputs: block type, block structure and the number of classes. In this tutorial we do not include the Bottleneck structure seen in deeper models like ResNet-50. More information can be found at https://arxiv.org/pdf/1512.03385.pdf. + +In order to use the weights from the CORDS dataloader, the reduction of the criterion must be set to ``'none'``. + +.. 
code-block:: python + + device = torch.device("cpu") # CPU-only here; see the note below about the MPI/GPU script + + def run(config: dict): + acc = 0 + batch = 20 + + train_ds, valid_ds = load_data() + + train_dl = DataLoader(train_ds, batch_size = batch, shuffle = True, num_workers = 0, pin_memory = False) + valid_dl = DataLoader(valid_ds, batch_size = batch, shuffle = True, num_workers = 0, pin_memory = False) + + dhargs = {'train_d': train_dl, 'valid_d': valid_dl, 'lr': config['lr'], 'batch': batch, 'reg_coeff': config['regularization']} + block_struct = [2, 2, 2, 2] + model = ResNet(BasicBlock, block_struct, 10).to(device) + + criterion = nn.CrossEntropyLoss(reduction = 'none') + optimizer = o.RMSprop(model.parameters(), lr=config["lr"], + momentum=config['momentum'], + weight_decay=config['weightdecay']) + scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(optimizer, T_max=config["t_max"]) + + subset = SubsetDL(model, criterion, dhargs) + acc = train(model, criterion, optimizer, scheduler, epochs, subset, valid_dl) + + return acc + +**Running the Search** + +Now that we have defined our data, training and validation, model, run function and hyperparameter space, we can conduct a search. Note that in the script file, the code was formatted for MPI usage. Altering all references of device from ``device = torch.device("cuda", rank)`` to ``device = torch.device("cpu")`` would be sufficient to switch to the CPU. + +.. code-block:: python + + if __name__ == "__main__": + method_kwargs = {"callbacks": [TqdmCallback()]} + + prob = HpProblem() + + prob.add_hyperparameter((1e-05,5e-01, 'log-uniform'), "lr") + prob.add_hyperparameter((0.1,0.95), "momentum") + prob.add_hyperparameter((1e-5,1e-3, 'log-uniform'), "weightdecay") + prob.add_hyperparameter((0.01, 10.0, 'log-uniform'), 'regularization') + prob.add_hyperparameter((1, 50), "t_max") + + epochs = 50 + + search_log_dir = "search_log/" + pathlib.Path(search_log_dir).mkdir(parents=False, exist_ok=True) + + with Evaluator.create( + run, + method="thread", + method_kwargs=method_kwargs + ) as evaluator: + if evaluator is not None: + print(f"Creation of the Evaluator done with {evaluator.num_workers} worker(s)") + + # Search creation + search = CBO(prob, + evaluator) + + # Search execution + print("Starting the search...") + prelim_result = search.search(max_evals = 50) + print("Search is done") + + prelim_result.to_csv(os.path.join(search_log_dir, "results.csv")) + i_max = prelim_result.objective.argmax() + + print(f"\nThe default configuration has an accuracy of {prelim_result['objective'].iloc[0]:.3f}. 
\n" \ + f"The best configuration found by DeepHyper has an accuracy {prelim_result['objective'].iloc[i_max]:.3f}, \n" \ + f"finished after {prelim_result['timestamp_gather'].iloc[i_max]-prelim_result['timestamp_submit'].iloc[i_max]:.2f} seconds of search.\n") diff --git a/tutorials/scripts/04_CORDS_Training/cifar10cordsmodel.py b/tutorials/scripts/04_CORDS_Training/cifar10cordsmodel.py new file mode 100644 index 0000000..652323a --- /dev/null +++ b/tutorials/scripts/04_CORDS_Training/cifar10cordsmodel.py @@ -0,0 +1,345 @@ +import torch +import torch.nn as nn +import torch.nn.functional as f +import torch.optim as o +from torch.utils.data import DataLoader + +import mpi4py +from mpi4py import MPI + +import cords +from cords.utils.data.datasets.SL import gen_dataset +from cords.utils.data.dataloader.SL.adaptive import GradMatchDataLoader, RandomDataLoader +from dotmap import DotMap + +import deephyper as dh +from deephyper.problem import HpProblem +from deephyper.evaluator import Evaluator, profile +from deephyper.search.hps import CBO +from deephyper.evaluator.callback import TqdmCallback + +import pathlib +import os +import os.path as osp +import logging +import sys + +mpi4py.rc.initialize = False +mpi4py.rc.threads = True +mpi4py.rc.thread_level = "multiple" + +if not MPI.Is_initialized(): + MPI.Init_thread() + +comm = MPI.COMM_WORLD +rank = comm.Get_rank() + +search_log_dir = "search_log/" +pathlib.Path(search_log_dir).mkdir(parents=False, exist_ok=True) + +def load_data(): + train_ds, valid_ds, test_ds, num_cls = gen_dataset('/lus/grand/projects/datascience/ianwixom/expcifar/', 'cifar10', None, isnumpy=False) + + return train_ds, valid_ds +# Lines between __get_logger and SubsetDL are CORDS based +def __get_logger(results_dir): + os.makedirs(results_dir, exist_ok=True) + plain_formatter = logging.Formatter("[%(asctime)s] %(name)s %(levelname)s: %(message)s", + datefmt="%m/%d %H:%M:%S") + logger = logging.getLogger(__name__) + logger.setLevel(logging.INFO) + s_handler = logging.StreamHandler(stream=sys.stdout) + s_handler.setFormatter(plain_formatter) + s_handler.setLevel(logging.INFO) + logger.addHandler(s_handler) + f_handler = logging.FileHandler(os.path.join(results_dir, "results.log")) + f_handler.setFormatter(plain_formatter) + f_handler.setLevel(logging.DEBUG) + logger.addHandler(f_handler) + logger.propagate = False + return logger + +results_dir = osp.abspath(osp.expanduser('results')) +logger = __get_logger(results_dir) + +def RandomDL(dhargs, rank): + dss_args = dict(select_every = 10, + kappa = 0, + fraction = 0.125, + device = 'cuda{}'.format(rank)) + + dss_args = DotMap(dss_args) + return RandomDataLoader(dhargs['train_d'], dss_args, logger, + batch_size=dhargs['batch'], shuffle=True, pin_memory=True) + +def SubsetDL(model, criterion, dhargs, rank): + dss_args = dict(model = model, + loss = criterion, + eta = dhargs['lr'], + num_classes = 10, + device = 'cuda:{}'.format(rank), + fraction = 0.125, + kappa = 0, + select_every = 10, + linear_layer = False, + selection_type = 'PerBatch', + valid = False, + v1 = True, + lam = dhargs['reg_coeff'], + eps = 0.1) + + dss_args = DotMap(dss_args) + + return GradMatchDataLoader(dhargs['train_d'], dhargs['valid_d'], dss_args, + logger, batch_size=dhargs['batch'], + shuffle=True, pin_memory=True) + +def train(model, criterion, optimizer, scheduler, epochs, dl, valid_dl, device): + acc_max = 0 + + acc = valid(model, criterion, valid_dl, device) + print(f"The accuracy of the model of worker {rank} on epoch 0 is {acc*100}%") + + for i in 
range(epochs): + model.train() + + for _, (features, labels, weights) in enumerate(dl): + features, labels, weights = features.to(device), labels.to(device, non_blocking = True), weights.to(device) + + optimizer.zero_grad(set_to_none=True) + predictions = model(features) + loss = torch.dot(criterion(predictions, labels), weights / weights.sum()) + loss.backward() + optimizer.step() + + if (i % 10 == 0): + acc = valid(model, criterion, valid_dl, device) + print(f"The accuracy of the model of worker {rank} on epoch {i+1} is {acc*100}%") + + if acc_max < acc: + acc_max = acc + scheduler.step() + + acc = valid(model, criterion, valid_dl, device) + print(f"The accuracy of the model of worker {rank} on epoch {epochs} is {acc*100}%") + del loss + + if acc_max < acc: + acc_max = acc + return acc_max + +def train_nocords(model, criterion, optimizer, scheduler, epochs, dl, valid_dl, device): + acc_max = 0 + + acc = valid(model, criterion, valid_dl, device) + print(f"The accuracy of the model of worker {rank} on epoch 0 is {round(acc*100, 2)}%") + + for i in range(epochs): + model.train() + + for _, (features, labels) in enumerate(dl): + features, labels = features.to(device), labels.to(device, non_blocking = True) + + optimizer.zero_grad(set_to_none=True) + predictions = model(features) + loss = criterion(predictions, labels) + loss.backward() + optimizer.step() + + if (i % 10 == 0): + acc = valid(model, criterion, valid_dl, device) + print(f"The accuracy of the model of worker {rank} on epoch {i+1} is {round(acc*100, 2)}%") + + if acc_max < acc: + acc_max = acc + scheduler.step() + + acc = valid(model, criterion, valid_dl, device) + print(f"The accuracy of the model of worker {rank} on epoch {epochs} is {round(acc*100, 2)}%") + + if acc_max < acc: + acc_max = acc + return round(acc_max, 4) + +def valid(model, criterion, dl, device): + model.eval() + # val_loss = 0 + correct = 0 + with torch.no_grad(): + for _, (features, labels) in enumerate(dl): + features, labels = features.to(device), labels.to(device, non_blocking = True) + predictions = model(features) + # loss = criterion(predictions, labels) # use if one wants loss instead of correct + correct += float((predictions.argmax(1) == labels).type(torch.float).sum().item()) + return correct / len(dl.dataset) + +# Lines between BasicBlock and ResNet34 are (mostly) unmodified ResNet code from CORDS. 
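+# A note on usage (illustrative, not from CORDS): ResNet(BasicBlock, [2, 2, 2, 2]) below builds a +# ResNet-18-style network for CIFAR10; deeper BasicBlock variants only need a different num_blocks +# list, e.g. [3, 4, 6, 3] for a ResNet-34-style model.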
+class BasicBlock(nn.Module): + expansion = 1 + + def __init__(self, in_planes, planes, stride=1): + super(BasicBlock, self).__init__() + self.conv1 = nn.Conv2d(in_planes, planes, kernel_size=3, stride=stride, padding=1, bias=False) + self.bn1 = nn.BatchNorm2d(planes) + self.conv2 = nn.Conv2d(planes, planes, kernel_size=3, stride=1, padding=1, bias=False) + self.bn2 = nn.BatchNorm2d(planes) + + self.shortcut = nn.Sequential() + if stride != 1 or in_planes != self.expansion*planes: + self.shortcut = nn.Sequential( + nn.Conv2d(in_planes, self.expansion*planes, kernel_size=1, stride=stride, bias=False), + nn.BatchNorm2d(self.expansion*planes) + ) + + def forward(self, x): + out = f.relu(self.bn1(self.conv1(x))) + out = self.bn2(self.conv2(out)) + out += self.shortcut(x) + out = f.relu(out) + return out + +class ResNet(nn.Module): + def __init__(self, block, num_blocks, num_classes=10): + super(ResNet, self).__init__() + self.in_planes = 64 + self.embDim = 8 * self.in_planes * block.expansion + + self.conv1 = nn.Conv2d(3, 64, kernel_size=3, stride=1, padding=1, bias=False) + self.bn1 = nn.BatchNorm2d(64) + self.layer1 = self._make_layer(block, 64, num_blocks[0], stride=1) + self.layer2 = self._make_layer(block, 128, num_blocks[1], stride=2) + self.layer3 = self._make_layer(block, 256, num_blocks[2], stride=2) + self.layer4 = self._make_layer(block, 512, num_blocks[3], stride=2) + self.linear = nn.Linear(512*block.expansion, num_classes) + + + def _make_layer(self, block, planes, num_blocks, stride): + strides = [stride] + [1]*(num_blocks-1) + layers = [] + for stride in strides: + layers.append(block(self.in_planes, planes, stride)) + self.in_planes = planes * block.expansion + return nn.Sequential(*layers) + + def forward(self, x, last=False, freeze=False): + if freeze: + with torch.no_grad(): + out = f.relu(self.bn1(self.conv1(x))) + out = self.layer1(out) + out = self.layer2(out) + out = self.layer3(out) + out = self.layer4(out) + out = f.avg_pool2d(out, 4) + e = out.view(out.size(0), -1) + else: + out = f.relu(self.bn1(self.conv1(x))) + out = self.layer1(out) + out = self.layer2(out) + out = self.layer3(out) + out = self.layer4(out) + out = f.avg_pool2d(out, 4) + e = out.view(out.size(0), -1) + out = self.linear(e) + if last: + return out, e + else: + return out + + def get_embedding_dim(self): + return self.embDim + +@profile +def run(config: dict): + + is_cords = True + acc = 0 + batch = 32 + device = torch.device("cuda" if is_gpu_available else "cpu") + if is_gpu_available and n_gpus > 1: + device = torch.device("cuda", rank) + srank = rank + print("Running on GPU ", rank) + elif is_gpu_available and n_gpus == 1: + device = torch.device("cuda") + srank = 0 + print("Running on the GPU") + else: + device = torch.device("cpu") + print("Running on the CPU") + + train_ds, valid_ds = load_data() + train_dl = DataLoader(train_ds, batch_size = batch, shuffle = True, num_workers = 0, pin_memory = True) + valid_dl = DataLoader(valid_ds, batch_size = batch, shuffle = True, num_workers = 0, pin_memory = True) + dhargs = {'train_d': train_dl, 'valid_d': valid_dl, 'lr': config['lr'], 'batch': batch, 'reg_coeff': config['regularization']} + block_struct = [2, 2, 2, 2] + + model = ResNet(BasicBlock, block_struct, 10) + + model.to(device) + if is_cords: + criterion = nn.CrossEntropyLoss(reduction = 'none') + else: + criterion = nn.CrossEntropyLoss() + optimizer = o.RMSprop(model.parameters(), lr=config["lr"], + momentum=config['momentum'], + weight_decay=config['weightdecay']) + + scheduler = 
o.lr_scheduler.CosineAnnealingLR(optimizer, T_max=config["t_max"]) + if is_cords: + subset = RandomDL(dhargs, srank) + acc = train(model, criterion, optimizer, scheduler, epochs, subset, valid_dl, device) + else: + acc = train_nocords(model, criterion, optimizer, scheduler, epochs, train_dl, valid_dl, device) + + # Free GPU memory + del model + torch.cuda.empty_cache() + return acc + +if __name__ == "__main__": + method_kwargs = {"callbacks": [TqdmCallback()]} + + is_gpu_available = torch.cuda.is_available() + + n_gpus = torch.cuda.device_count() if is_gpu_available else 0 + + prob = HpProblem() + + prob.add_hyperparameter((1e-05,5e-01, 'log-uniform'), "lr") + prob.add_hyperparameter((0.1,0.95), "momentum") + prob.add_hyperparameter((1e-5,1e-3, 'log-uniform'), "weightdecay") + prob.add_hyperparameter((0.01, 10.0, 'log-uniform'), 'regularization') + prob.add_hyperparameter((1, 50), "t_max") + + epochs = 50 + + if rank == 0: + # Evaluator creation + print("Creation of the Evaluator...") + + with Evaluator.create( + run, + method="mpicomm", + method_kwargs=method_kwargs + ) as evaluator: + if evaluator is not None: + print(f"Creation of the Evaluator done with {evaluator.num_workers} worker(s)") + + # Search creation + search = CBO(prob, + evaluator) + + # Search execution + print("Starting the search...") + prelim_result = search.search(max_evals = 50) + print("Search is done") + + prelim_result.to_csv(os.path.join(search_log_dir, "results.csv")) + i_max = prelim_result.objective.argmax() + + print(f"\nThe default configuration has an accuracy of {prelim_result['objective'].iloc[0]:.3f}. \n" \ + f"The best configuration found by DeepHyper has an accuracy of {prelim_result['objective'].iloc[i_max]:.3f}, \n" \ + f"finished after {prelim_result['timestamp_gather'].iloc[i_max]-prelim_result['timestamp_submit'].iloc[i_max]:.2f} seconds of search.\n") + + + diff --git a/tutorials/scripts/04_CORDS_Training/cifar10tutorial.py b/tutorials/scripts/04_CORDS_Training/cifar10tutorial.py new file mode 100644 index 0000000..5c97d28 --- /dev/null +++ b/tutorials/scripts/04_CORDS_Training/cifar10tutorial.py @@ -0,0 +1,262 @@ +import torch +import torch.nn as nn +import torch.nn.functional as f +import torch.optim as o +from torch.utils.data import DataLoader + +import cords +from cords.utils.data.datasets.SL import gen_dataset +from cords.utils.data.dataloader.SL.adaptive import GradMatchDataLoader, RandomDataLoader, OLRandomDataLoader +from dotmap import DotMap + +import deephyper as dh +from deephyper.problem import HpProblem +from deephyper.evaluator import Evaluator, profile +from deephyper.search.hps import CBO +from deephyper.evaluator.callback import TqdmCallback + +import pathlib +import os +import os.path as osp +import logging +import sys + +import mpi4py +from mpi4py import MPI + +mpi4py.rc.initialize = False +mpi4py.rc.threads = True +mpi4py.rc.thread_level = "multiple" + +if not MPI.Is_initialized(): + MPI.Init_thread() + +comm = MPI.COMM_WORLD +rank = comm.Get_rank() + +search_log_dir = "search_log/" +pathlib.Path(search_log_dir).mkdir(parents=False, exist_ok=True) + +def load_data(): + train_ds, valid_ds, test_ds, num_cls = gen_dataset('/lus/grand/projects/datascience/ianwixom/expcifar/', 'cifar10', None, isnumpy=False) + + return train_ds, valid_ds + +def __get_logger(results_dir): + os.makedirs(results_dir, exist_ok=True) + plain_formatter = logging.Formatter("[%(asctime)s] %(name)s 
%(levelname)s: %(message)s", + datefmt="%m/%d %H:%M:%S") + logger = logging.getLogger(__name__) + logger.setLevel(logging.INFO) + s_handler = logging.StreamHandler(stream=sys.stdout) + s_handler.setFormatter(plain_formatter) + s_handler.setLevel(logging.INFO) + logger.addHandler(s_handler) + f_handler = logging.FileHandler(os.path.join(results_dir, "results.log")) + f_handler.setFormatter(plain_formatter) + f_handler.setLevel(logging.DEBUG) + logger.addHandler(f_handler) + logger.propagate = False + return logger + +results_dir = osp.abspath(osp.expanduser('results')) +logger = __get_logger(results_dir) + +def RandomDL(dhargs): + dss_args = dict(select_every = 10, + kappa = 0, + fraction = 0.125, + device = 'cuda') + + dss_args = DotMap(dss_args) + return RandomDataLoader(dhargs['train_d'], dss_args, logger, + batch_size=dhargs['batch'], shuffle=True, pin_memory=True) + +def SubsetDL(model, criterion, dhargs): + dss_args = dict(model = model, + loss = criterion, + eta = dhargs['lr'], + num_classes = 10, + device = 'cpu', + fraction = 0.125, + kappa = 0, + select_every = 10, + linear_layer = False, + selection_type = 'PerBatch', + valid = False, + v1 = True, + lam = dhargs['reg_coeff'], + eps = 0.1) + + dss_args = DotMap(dss_args) + + return GradMatchDataLoader(dhargs['train_d'], dhargs['valid_d'], dss_args, + logger, batch_size=dhargs['batch'], + shuffle=True, pin_memory=True) + +def train(model, criterion, optimizer, scheduler, epochs, dl, valid_dl): + acc_max = 0 + for i in range(epochs): + model.train() + for _, (features, labels, weights) in enumerate(dl): + features, labels, weights = features.to(device), labels.to(device, non_blocking = True), weights.to(device) + + optimizer.zero_grad() + predictions = model(features) + loss = torch.dot(criterion(predictions, labels), weights / weights.sum()) + loss.backward() + optimizer.step() + + acc = valid(model, optimizer, valid_dl) + print(f"The accuracy of the model on epoch {i} is {acc*100:.1f}%") + + if acc_max < acc: + acc_max = acc + + return acc_max + +def valid(model, optimizer, dl): + model.eval() + correct = 0 + with torch.no_grad(): + for _, (features, labels) in enumerate(dl): + features, labels = features.to(device), labels.to(device, non_blocking = True) + predictions = model(features) + correct += (predictions.argmax(1) == labels).type(torch.float).sum().item() + return correct / len(dl.dataset) + +class BasicBlock(nn.Module): + expansion = 1 + + def __init__(self, in_planes, planes, stride=1): + super(BasicBlock, self).__init__() + self.conv1 = nn.Conv2d(in_planes, planes, kernel_size=3, stride=stride, padding=1, bias=False) + self.bn1 = nn.BatchNorm2d(planes) + self.conv2 = nn.Conv2d(planes, planes, kernel_size=3, stride=1, padding=1, bias=False) + self.bn2 = nn.BatchNorm2d(planes) + + self.shortcut = nn.Sequential() + if stride != 1 or in_planes != self.expansion*planes: + self.shortcut = nn.Sequential( + nn.Conv2d(in_planes, self.expansion*planes, kernel_size=1, stride=stride, bias=False), + nn.BatchNorm2d(self.expansion*planes) + ) + + def forward(self, x): + out = f.relu(self.bn1(self.conv1(x))) + out = self.bn2(self.conv2(out)) + out += self.shortcut(x) + out = f.relu(out) + return out + +class ResNet(nn.Module): + def __init__(self, block, num_blocks, num_classes=10): + super(ResNet, self).__init__() + self.in_planes = 64 + self.embDim = 8 * self.in_planes * block.expansion + + self.conv1 = nn.Conv2d(3, 64, kernel_size=3, stride=1, padding=1, bias=False) + self.bn1 = nn.BatchNorm2d(64) + self.layer1 = 
self._make_layer(block, 64, num_blocks[0], stride=1) + self.layer2 = self._make_layer(block, 128, num_blocks[1], stride=2) + self.layer3 = self._make_layer(block, 256, num_blocks[2], stride=2) + self.layer4 = self._make_layer(block, 512, num_blocks[3], stride=2) + self.linear = nn.Linear(512*block.expansion, num_classes) + + + def _make_layer(self, block, planes, num_blocks, stride): + strides = [stride] + [1]*(num_blocks-1) + layers = [] + for stride in strides: + layers.append(block(self.in_planes, planes, stride)) + self.in_planes = planes * block.expansion + return nn.Sequential(*layers) + + def forward(self, x, last=False, freeze=False): + if freeze: + with torch.no_grad(): + out = f.relu(self.bn1(self.conv1(x))) + out = self.layer1(out) + out = self.layer2(out) + out = self.layer3(out) + out = self.layer4(out) + out = f.avg_pool2d(out, 4) + e = out.view(out.size(0), -1) + else: + out = f.relu(self.bn1(self.conv1(x))) + out = self.layer1(out) + out = self.layer2(out) + out = self.layer3(out) + out = self.layer4(out) + out = f.avg_pool2d(out, 4) + e = out.view(out.size(0), -1) + out = self.linear(e) + if last: + return out, e + else: + return out + + def get_embedding_dim(self): + return self.embDim + +device = torch.device("cpu") # CPU-only variant of the script; the MPI/GPU version sets the device per rank + +def run(config: dict): + acc = 0 + batch = 20 + + train_ds, valid_ds = load_data() + + train_dl = DataLoader(train_ds, batch_size = batch, shuffle = True, num_workers = 0, pin_memory = False) + valid_dl = DataLoader(valid_ds, batch_size = batch, shuffle = True, num_workers = 0, pin_memory = False) + + dhargs = {'train_d': train_dl, 'valid_d': valid_dl, 'lr': config['lr'], 'batch': batch, 'reg_coeff': config['regularization']} + block_struct = [2, 2, 2, 2] + model = ResNet(BasicBlock, block_struct, 10).to(device) + + criterion = nn.CrossEntropyLoss(reduction = 'none') + optimizer = o.RMSprop(model.parameters(), lr=config["lr"], + momentum=config['momentum'], + weight_decay=config['weightdecay']) + scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(optimizer, T_max=config["t_max"]) + + subset = SubsetDL(model, criterion, dhargs) + acc = train(model, criterion, optimizer, scheduler, epochs, subset, valid_dl) + + return acc + +if __name__ == "__main__": + method_kwargs = {"callbacks": [TqdmCallback()]} + + prob = HpProblem() + + prob.add_hyperparameter((1e-05,5e-01, 'log-uniform'), "lr") + prob.add_hyperparameter((0.1,0.95), "momentum") + prob.add_hyperparameter((1e-5,1e-3, 'log-uniform'), "weightdecay") + prob.add_hyperparameter((0.01, 10.0, 'log-uniform'), 'regularization') + prob.add_hyperparameter((1, 50), "t_max") + + epochs = 50 + + with Evaluator.create( + run, + method="mpicomm", + method_kwargs=method_kwargs + ) as evaluator: + if evaluator is not None: + print(f"Creation of the Evaluator done with {evaluator.num_workers} worker(s)") + + # Search creation + search = CBO(prob, + evaluator) + + # Search execution + print("Starting the search...") + prelim_result = search.search(max_evals = 50) + print("Search is done") + + prelim_result.to_csv(os.path.join(search_log_dir, "results.csv")) + i_max = prelim_result.objective.argmax() + + print(f"\nThe default configuration has an accuracy of {prelim_result['objective'].iloc[0]:.3f}. \n" \ + f"The best configuration found by DeepHyper has an accuracy of {prelim_result['objective'].iloc[i_max]:.3f}, \n" \ + f"finished after {prelim_result['timestamp_gather'].iloc[i_max]-prelim_result['timestamp_submit'].iloc[i_max]:.2f} seconds of search.\n")
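+ +# Example launch (an assumption, not stated in the tutorial: the "mpicomm" evaluator expects the +# script to be started under an MPI launcher), e.g.: +# mpirun -np 8 python cifar10tutorial.py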