Commit 49a269f

build DAEFR project
0 parents  commit 49a269f


63 files changed: 12371 additions, 0 deletions

.gitignore

Lines changed: 3 additions & 0 deletions
@@ -0,0 +1,3 @@
+__pycache__
+experiments/
+data/

DAEFR/.DS_Store

12 KB
Binary file not shown.

DAEFR/distributed/__init__.py

Lines changed: 13 additions & 0 deletions
@@ -0,0 +1,13 @@
+from .distributed import (
+    get_rank,
+    get_local_rank,
+    is_primary,
+    synchronize,
+    get_world_size,
+    all_reduce,
+    all_gather,
+    reduce_dict,
+    data_sampler,
+    LOCAL_PROCESS_GROUP,
+)
+from .launch import launch
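
For orientation, here is a minimal sketch of how a training script might use the names re-exported above. It assumes the repository root is on the Python path so that DAEFR.distributed is importable; the helper names in the sketch (log_status, checkpoint_and_wait, save_fn) are illustrative and not part of the commit.

# Hypothetical usage of the re-exported helpers; runs as-is in a single process,
# because the helpers fall back to rank 0 / world size 1 when torch.distributed
# is not initialized.
from DAEFR.distributed import get_rank, get_world_size, is_primary, synchronize


def log_status(message):
    # Only the primary (rank-0) process prints; other ranks stay silent.
    if is_primary():
        print(f"[rank {get_rank()}/{get_world_size()}] {message}")


def checkpoint_and_wait(save_fn):
    # Rank 0 writes the checkpoint, then all ranks wait at the barrier so no
    # process reads a half-written file.
    if is_primary():
        save_fn()
    synchronize()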

DAEFR/distributed/distributed.py

Lines changed: 143 additions & 0 deletions
@@ -0,0 +1,143 @@
+import math
+import pickle
+
+import torch
+from torch import distributed as dist
+from torch.utils import data
+
+
+LOCAL_PROCESS_GROUP = None
+
+
+def is_primary():
+    return get_rank() == 0
+
+
+def get_rank():
+    if not dist.is_available():
+        return 0
+
+    if not dist.is_initialized():
+        return 0
+
+    return dist.get_rank()
+
+
+def get_local_rank():
+    if not dist.is_available():
+        return 0
+
+    if not dist.is_initialized():
+        return 0
+
+    if LOCAL_PROCESS_GROUP is None:
+        raise ValueError("tensorfn.distributed.LOCAL_PROCESS_GROUP is None")
+
+    return dist.get_rank(group=LOCAL_PROCESS_GROUP)
+
+
+def synchronize():
+    if not dist.is_available():
+        return
+
+    if not dist.is_initialized():
+        return
+
+    world_size = dist.get_world_size()
+
+    if world_size == 1:
+        return
+
+    dist.barrier()
+
+
+def get_world_size():
+    if not dist.is_available():
+        return 1
+
+    if not dist.is_initialized():
+        return 1
+
+    return dist.get_world_size()
+
+
+def all_reduce(tensor, op=dist.ReduceOp.SUM):
+    world_size = get_world_size()
+
+    if world_size == 1:
+        return tensor
+
+    dist.all_reduce(tensor, op=op)
+
+    return tensor
+
+
+def all_gather(data):
+    world_size = get_world_size()
+
+    if world_size == 1:
+        return [data]
+
+    buffer = pickle.dumps(data)
+    storage = torch.ByteStorage.from_buffer(buffer)
+    tensor = torch.ByteTensor(storage).to("cuda")
+
+    local_size = torch.IntTensor([tensor.numel()]).to("cuda")
+    size_list = [torch.IntTensor([1]).to("cuda") for _ in range(world_size)]
+    dist.all_gather(size_list, local_size)
+    size_list = [int(size.item()) for size in size_list]
+    max_size = max(size_list)
+
+    tensor_list = []
+    for _ in size_list:
+        tensor_list.append(torch.ByteTensor(size=(max_size,)).to("cuda"))
+
+    if local_size != max_size:
+        padding = torch.ByteTensor(size=(max_size - local_size,)).to("cuda")
+        tensor = torch.cat((tensor, padding), 0)
+
+    dist.all_gather(tensor_list, tensor)
+
+    data_list = []
+
+    for size, tensor in zip(size_list, tensor_list):
+        buffer = tensor.cpu().numpy().tobytes()[:size]
+        data_list.append(pickle.loads(buffer))
+
+    return data_list
+
+
+def reduce_dict(input_dict, average=True):
+    world_size = get_world_size()
+
+    if world_size < 2:
+        return input_dict
+
+    with torch.no_grad():
+        keys = []
+        values = []
+
+        for k in sorted(input_dict.keys()):
+            keys.append(k)
+            values.append(input_dict[k])
+
+        values = torch.stack(values, 0)
+        dist.reduce(values, dst=0)
+
+        if dist.get_rank() == 0 and average:
+            values /= world_size
+
+        reduced_dict = {k: v for k, v in zip(keys, values)}
+
+    return reduced_dict
+
+
+def data_sampler(dataset, shuffle, distributed):
+    if distributed:
+        return data.distributed.DistributedSampler(dataset, shuffle=shuffle)
+
+    if shuffle:
+        return data.RandomSampler(dataset)
+
+    else:
+        return data.SequentialSampler(dataset)
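
To make the intended call pattern concrete, here is a hedged usage sketch of data_sampler, reduce_dict, and all_gather. The dataset, batch size, and loss names are placeholders, and the import assumes the repository root is on the Python path. With a world size of 1 the helpers short-circuit, so the sketch runs in a single CPU process; the multi-rank code paths additionally require CUDA and an initialized process group.

# Hypothetical usage sketch of the helpers in distributed.py (placeholder data).
import torch
from torch.utils import data

from DAEFR.distributed import all_gather, data_sampler, get_world_size, is_primary, reduce_dict

dataset = data.TensorDataset(torch.randn(32, 4))
sampler = data_sampler(dataset, shuffle=True, distributed=get_world_size() > 1)
loader = data.DataLoader(dataset, batch_size=8, sampler=sampler)

for (batch,) in loader:
    losses = {"l1": batch.abs().mean(), "mse": batch.pow(2).mean()}
    # Average per-rank loss values onto rank 0 for logging (no-op when world size is 1).
    reduced = reduce_dict(losses, average=True)
    if is_primary():
        print({k: float(v) for k, v in reduced.items()})

# Gather arbitrary picklable objects from every rank; returns [obj] when world size is 1.
stats = all_gather({"n_samples": len(dataset)})
print(stats)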

DAEFR/distributed/launch.py

Lines changed: 92 additions & 0 deletions
@@ -0,0 +1,92 @@
+import os
+
+import torch
+from torch import distributed as dist
+from torch import multiprocessing as mp
+
+from . import distributed as dist_fn
+
+
+def find_free_port():
+    import socket
+
+    sock = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
+
+    sock.bind(("", 0))
+    port = sock.getsockname()[1]
+    sock.close()
+
+    return port
+
+
+def launch(fn, n_gpu_per_machine, n_machine=1, machine_rank=0, dist_url=None, args=()):
+    world_size = n_machine * n_gpu_per_machine
+
+    if world_size > 1:
+        if "OMP_NUM_THREADS" not in os.environ:
+            os.environ["OMP_NUM_THREADS"] = "1"
+
+        if dist_url == "auto":
+            if n_machine != 1:
+                raise ValueError('dist_url="auto" not supported in multi-machine jobs')
+
+            port = find_free_port()
+            dist_url = f"tcp://127.0.0.1:{port}"
+
+        if n_machine > 1 and dist_url.startswith("file://"):
+            raise ValueError(
+                "file:// is not a reliable init method in multi-machine jobs. Prefer tcp://"
+            )
+
+        mp.spawn(
+            distributed_worker,
+            nprocs=n_gpu_per_machine,
+            args=(fn, world_size, n_gpu_per_machine, machine_rank, dist_url, args),
+            daemon=False,
+        )
+
+    else:
+        fn(*args)
+
+
+def distributed_worker(
+    local_rank, fn, world_size, n_gpu_per_machine, machine_rank, dist_url, args
+):
+    if not torch.cuda.is_available():
+        raise OSError("CUDA is not available. Please check your environment")
+
+    global_rank = machine_rank * n_gpu_per_machine + local_rank
+
+    try:
+        dist.init_process_group(
+            backend="NCCL",
+            init_method=dist_url,
+            world_size=world_size,
+            rank=global_rank,
+        )
+
+    except Exception:
+        raise OSError("failed to initialize NCCL groups")
+
+    dist_fn.synchronize()
+
+    if n_gpu_per_machine > torch.cuda.device_count():
+        raise ValueError(
+            f"specified n_gpu_per_machine larger than available device ({torch.cuda.device_count()})"
+        )
+
+    torch.cuda.set_device(local_rank)
+
+    if dist_fn.LOCAL_PROCESS_GROUP is not None:
+        raise ValueError("distributed.LOCAL_PROCESS_GROUP is not None")
+
+    n_machine = world_size // n_gpu_per_machine
+
+    for i in range(n_machine):
+        ranks_on_i = list(range(i * n_gpu_per_machine, (i + 1) * n_gpu_per_machine))
+        pg = dist.new_group(ranks_on_i)
+
+        if i == machine_rank:
+            dist_fn.LOCAL_PROCESS_GROUP = pg
+
+    fn(*args)
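
As a usage sketch for the launcher above: the train function, GPU count, and config path below are placeholders, and the multi-process branch assumes a machine with that many CUDA devices and NCCL available. When world_size is 1, launch simply calls fn(*args) in-process, so the same entry point also works for single-GPU debugging.

# Hypothetical entry point driving launch(); "train" and its argument are placeholders.
from DAEFR.distributed import get_local_rank, get_rank, launch


def train(config_path):
    # Each spawned worker lands here after distributed_worker has initialized the
    # process group and called torch.cuda.set_device(local_rank).
    print(f"rank {get_rank()} (local {get_local_rank()}) training with {config_path}")


if __name__ == "__main__":
    launch(
        train,
        n_gpu_per_machine=2,      # assumption: one machine with two GPUs
        n_machine=1,
        machine_rank=0,
        dist_url="auto",          # binds a free local TCP port via find_free_port()
        args=("config.yaml",),    # placeholder config path
    )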

DAEFR/models/.DS_Store

6 KB
Binary file not shown.
