Skip to content

Commit 7b41850

Browse files
authored
fix: ddp for multi-node setup fixed (#1402)
1 parent d6849fe commit 7b41850

5 files changed

Lines changed: 77 additions & 254 deletions

File tree

catalyst/engines/torch.py

Lines changed: 18 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -59,6 +59,14 @@ class DistributedDataParallelEngine(Engine):
5959
during distributed training
6060
world_size: the number of processes to use for distributed training.
6161
Should be less than or equal to the number of GPUs
62+
workers_dist_rank: the rank of the first process to run on the node.
63+
It should be a number between `number of initialized processes`
64+
and `world_size - 1`, the other processes on the node will have ranks
65+
`# of initialized processes + 1`, `# of initialized processes + 2`, ...,
66+
`# of initialized processes + num_node_workers - 1`
67+
num_node_workers: the number of processes to launch on the node.
68+
For GPU training, this is recommended to be set to the number of GPUs
69+
on the current node so that each process can be bound to a single GPU
6270
process_group_kwargs: parameters for `torch.distributed.init_process_group`.
6371
More info here:
6472
https://pytorch.org/docs/stable/distributed.html#torch.distributed.init_process_group # noqa: E501, W505
@@ -72,13 +80,17 @@ def __init__(
7280
address: str = "127.0.0.1",
7381
port: Union[str, int] = 2112,
7482
world_size: Optional[int] = None,
83+
workers_dist_rank: int = 0,
84+
num_node_workers: Optional[int] = None,
7585
process_group_kwargs: Dict[str, Any] = None,
7686
**kwargs
7787
):
7888
"""Init."""
7989
self._address = os.environ.get("MASTER_ADDR", address)
8090
self._port = os.environ.get("MASTER_PORT", port)
81-
self._world_size = world_size
91+
self._num_local_workers = num_node_workers or torch.cuda.device_count() or 1
92+
self._workers_global_rank = workers_dist_rank
93+
self._world_size = world_size or self._num_local_workers
8294
self._process_group_kwargs = process_group_kwargs or {}
8395
self._args = args
8496
self._kwargs = kwargs
@@ -100,11 +112,10 @@ def spawn(self, fn: Callable, *args, **kwargs):
100112
Returns:
101113
wrapped function (if needed).
102114
"""
103-
world_size: int = self._world_size or torch.cuda.device_count()
104115
return mp.spawn(
105116
fn,
106-
args=(world_size,),
107-
nprocs=world_size,
117+
args=(self._world_size,),
118+
nprocs=self._num_local_workers,
108119
join=True,
109120
)
110121

@@ -121,10 +132,12 @@ def setup(self, local_rank: int, world_size: int):
121132
"world_size": world_size,
122133
**self._process_group_kwargs,
123134
}
135+
global_rank = self._workers_global_rank + local_rank
136+
124137
os.environ["MASTER_ADDR"] = str(self._address)
125138
os.environ["MASTER_PORT"] = str(self._port)
126139
os.environ["WORLD_SIZE"] = str(world_size)
127-
os.environ["RANK"] = str(local_rank)
140+
os.environ["RANK"] = str(global_rank)
128141
os.environ["LOCAL_RANK"] = str(local_rank)
129142
dist.init_process_group(**process_group_kwargs)
130143
super().__init__(self, *self._args, **self._kwargs)

examples/engines/README.md

Lines changed: 20 additions & 217 deletions
Original file line numberDiff line numberDiff line change
@@ -18,21 +18,19 @@ pip install catalyst
1818
<p>
1919

2020
```bash
21-
CUDA_VISIBLE_DEVICES="0" python train_resnet.py --engine=de
21+
CUDA_VISIBLE_DEVICES="0" python train_resnet.py
2222

2323
CUDA_VISIBLE_DEVICES="0,1" python train_resnet.py --engine=dp
2424

2525
# distributed training
26-
CUDA_VISIBLE_DEVICES="0,1" python train_resnet.py --engine=ddp --sync-bn
26+
CUDA_VISIBLE_DEVICES="0,1" python train_resnet.py --engine=ddp
2727

2828
# multi-node distributed training
2929
CUDA_VISIBLE_DEVICES="0,1" python train_resnet.py --engine=ddp \
3030
--master-addr=127.0.0.1 \
31-
--master-port=2112 \
3231
--world-size=8 \
3332
--dist-rank=0 \
34-
--num-workers=8 \
35-
--sync-bn
33+
--num-workers=8
3634
```
3735
</p>
3836
</details>
@@ -44,21 +42,19 @@ CUDA_VISIBLE_DEVICES="0,1" python train_resnet.py --engine=ddp \
4442
```bash
4543
pip install datasets transformers
4644

47-
CUDA_VISIBLE_DEVICES="0" python train_albert.py --engine=de
45+
CUDA_VISIBLE_DEVICES="0" python train_albert.py
4846

4947
CUDA_VISIBLE_DEVICES="0,1" python train_albert.py --engine=dp
5048

5149
# distributed training
52-
CUDA_VISIBLE_DEVICES="0,1" python train_albert.py --engine=ddp --sync-bn
50+
CUDA_VISIBLE_DEVICES="0,1" python train_albert.py --engine=ddp
5351

5452
# multi-node distributed training
5553
CUDA_VISIBLE_DEVICES="0,1" python train_albert.py --engine=ddp \
5654
--master-addr=127.0.0.1 \
57-
--master-port=2112 \
5855
--world-size=8 \
5956
--dist-rank=0 \
60-
--num-workers=8 \
61-
--sync-bn
57+
--num-workers=8
6258
```
6359
</p>
6460
</details>
@@ -73,19 +69,19 @@ pip install torch>=1.8.0 catalyst
7369
<p>
7470

7571
```bash
76-
CUDA_VISIBLE_DEVICES="0,1" python train_resnet.py --engine=amp-dp
72+
CUDA_VISIBLE_DEVICES="0" python train_resnet.py --engine=gpu-amp
73+
74+
CUDA_VISIBLE_DEVICES="0,1" python train_resnet.py --engine=dp-amp
7775

7876
# distributed training
79-
CUDA_VISIBLE_DEVICES="0,1" python train_resnet.py --engine=amp-ddp --sync-bn
77+
CUDA_VISIBLE_DEVICES="0,1" python train_resnet.py --engine=ddp-amp
8078

8179
# multi-node distributed training
82-
CUDA_VISIBLE_DEVICES="0,1" python train_resnet.py --engine=amp-ddp \
80+
CUDA_VISIBLE_DEVICES="0,1" python train_resnet.py --engine=ddp-amp \
8381
--master-addr=127.0.0.1 \
84-
--master-port=2112 \
8582
--world-size=8 \
8683
--dist-rank=0 \
87-
--num-workers=8 \
88-
--sync-bn
84+
--num-workers=8
8985
```
9086
</p>
9187
</details>
@@ -97,23 +93,24 @@ CUDA_VISIBLE_DEVICES="0,1" python train_resnet.py --engine=amp-ddp \
9793
```bash
9894
pip install datasets transformers
9995

100-
CUDA_VISIBLE_DEVICES="0,1" python train_albert.py --engine=amp-dp
96+
CUDA_VISIBLE_DEVICES="0" python train_albert.py --engine=gpu-amp
97+
98+
CUDA_VISIBLE_DEVICES="0,1" python train_albert.py --engine=dp-amp
10199

102100
# distributed training
103-
CUDA_VISIBLE_DEVICES="0,1" python train_albert.py --engine=amp-ddp --sync-bn
101+
CUDA_VISIBLE_DEVICES="0,1" python train_albert.py --engine=ddp-amp
104102

105103
# multi-node distributed training
106-
CUDA_VISIBLE_DEVICES="0,1" python train_albert.py --engine=amp-ddp \
104+
CUDA_VISIBLE_DEVICES="0,1" python train_albert.py --engine=ddp-amp \
107105
--master-addr=127.0.0.1 \
108-
--master-port=2112 \
109106
--world-size=8 \
110107
--dist-rank=0 \
111-
--num-workers=8 \
112-
--sync-bn
108+
--num-workers=8
113109
```
114110
</p>
115111
</details>
116112

113+
<!--
117114
### PyTorch XLA
118115
```bash
119116
pip install catalyst
@@ -145,198 +142,4 @@ python train_albert.py --engine=xla-ddp
145142
```
146143
</p>
147144
</details>
148-
149-
## Extensions
150-
151-
### Nvidia APEX
152-
```bash
153-
pip install catalyst && install-apex
154-
# or git clone https://github.com/NVIDIA/apex && cd apex && pip install -e .
155-
```
156-
157-
<details open>
158-
<summary>CV - ResNet</summary>
159-
<p>
160-
161-
```bash
162-
CUDA_VISIBLE_DEVICES="0,1" python train_resnet.py --engine=apex-dp
163-
164-
# distributed training
165-
CUDA_VISIBLE_DEVICES="0,1" python train_resnet.py --engine=apex-ddp --sync-bn
166-
167-
# multi-node distributed training
168-
CUDA_VISIBLE_DEVICES="0,1" python train_resnet.py --engine=apex-ddp \
169-
--master-addr=127.0.0.1 \
170-
--master-port=2112 \
171-
--world-size=8 \
172-
--dist-rank=0 \
173-
--num-workers=8 \
174-
--sync-bn
175-
```
176-
</p>
177-
</details>
178-
179-
<details>
180-
<summary>NLP - Albert</summary>
181-
<p>
182-
183-
```bash
184-
pip install datasets transformers
185-
186-
CUDA_VISIBLE_DEVICES="0,1" python train_albert.py --engine=apex-dp
187-
188-
# distributed training
189-
CUDA_VISIBLE_DEVICES="0,1" python train_albert.py --engine=apex-ddp --sync-bn
190-
191-
# multi-node distributed training
192-
CUDA_VISIBLE_DEVICES="0,1" python train_albert.py --engine=apex-ddp \
193-
--master-addr=127.0.0.1 \
194-
--master-port=2112 \
195-
--world-size=8 \
196-
--dist-rank=0 \
197-
--num-workers=8 \
198-
--sync-bn
199-
```
200-
</p>
201-
</details>
202-
203-
### DeepSpeed
204-
> *Tested under `docker pull deepspeed/deepspeed:v031_torch17_cuda11 and pip install -U torch==1.7.0 deepspeed==0.4.1 catalyst==21.12`.*
205-
```bash
206-
# docker pull deepspeed/deepspeed:v031_torch17_cuda11
207-
# docker run --rm -it -v $(pwd):/workspace deepspeed/deepspeed:v031_torch17_cuda11 /bin/bash
208-
pip install catalyst[deepspeed]
209-
```
210-
211-
<details open>
212-
<summary>CV - ResNet</summary>
213-
<p>
214-
215-
```bash
216-
# distributed training
217-
CUDA_VISIBLE_DEVICES="0,1" python train_resnet.py --engine=ds-ddp
218-
219-
# multi-node distributed training
220-
CUDA_VISIBLE_DEVICES="0,1" python train_resnet.py --engine=ds-ddp \
221-
--master-addr=127.0.0.1 \
222-
--master-port=2112 \
223-
--world-size=8 \
224-
--dist-rank=0 \
225-
--num-workers=8
226-
```
227-
</p>
228-
</details>
229-
230-
<details>
231-
<summary>NLP - Albert</summary>
232-
<p>
233-
234-
```bash
235-
pip install datasets transformers
236-
237-
# distributed training
238-
CUDA_VISIBLE_DEVICES="0,1" python train_albert.py --engine=ds-ddp
239-
240-
# multi-node distributed training
241-
CUDA_VISIBLE_DEVICES="0,1" python train_albert.py --engine=ds-ddp \
242-
--master-addr=127.0.0.1 \
243-
--master-port=2112 \
244-
--world-size=8 \
245-
--dist-rank=0 \
246-
--num-workers=8 \
247-
--sync-bn
248-
```
249-
</p>
250-
</details>
251-
252-
### FairScale
253-
> *Tested under `pip install -U torch==1.8.1 fairscale==0.3.7 catalyst==21.12`*
254-
```bash
255-
pip install torch>=1.8.0 catalyst[fairscale]
256-
```
257-
258-
<details open>
259-
<summary>CV - ResNet</summary>
260-
<p>
261-
262-
```bash
263-
CUDA_VISIBLE_DEVICES="0,1" python train_resnet.py --engine=fs-pp
264-
265-
# distributed training
266-
CUDA_VISIBLE_DEVICES="0,1" python train_resnet.py --engine=fs-ddp --sync-bn
267-
268-
CUDA_VISIBLE_DEVICES="0,1" python train_resnet.py --engine=fs-ddp-amp --sync-bn
269-
270-
CUDA_VISIBLE_DEVICES="0,1" python train_resnet.py --engine=fs-fddp --sync-bn
271-
272-
# multi-node distributed training
273-
CUDA_VISIBLE_DEVICES="0,1" python train_resnet.py --engine=fs-ddp \
274-
--master-addr=127.0.0.1 \
275-
--master-port=2112 \
276-
--world-size=8 \
277-
--dist-rank=0 \
278-
--num-workers=8 \
279-
--sync-bn
280-
281-
CUDA_VISIBLE_DEVICES="0,1" python train_resnet.py --engine=fs-ddp-amp \
282-
--master-addr=127.0.0.1 \
283-
--master-port=2112 \
284-
--world-size=8 \
285-
--dist-rank=0 \
286-
--num-workers=8 \
287-
--sync-bn
288-
289-
CUDA_VISIBLE_DEVICES="0,1" python train_resnet.py --engine=fs-fddp \
290-
--master-addr=127.0.0.1 \
291-
--master-port=2112 \
292-
--world-size=8 \
293-
--dist-rank=0 \
294-
--num-workers=8 \
295-
--sync-bn
296-
```
297-
</p>
298-
</details>
299-
300-
<details>
301-
<summary>NLP - Albert</summary>
302-
<p>
303-
304-
```bash
305-
pip install datasets transformers
306-
307-
CUDA_VISIBLE_DEVICES="0,1" python train_albert.py --engine=fs-pp
308-
309-
# distributed training
310-
CUDA_VISIBLE_DEVICES="0,1" python train_albert.py --engine=fs-ddp --sync-bn
311-
312-
CUDA_VISIBLE_DEVICES="0,1" python train_albert.py --engine=fs-ddp-amp --sync-bn
313-
314-
CUDA_VISIBLE_DEVICES="0,1" python train_albert.py --engine=fs-fddp --sync-bn
315-
316-
# multi-node distributed training
317-
CUDA_VISIBLE_DEVICES="0,1" python train_albert.py --engine=fs-ddp \
318-
--master-addr=127.0.0.1 \
319-
--master-port=2112 \
320-
--world-size=8 \
321-
--dist-rank=0 \
322-
--num-workers=8 \
323-
--sync-bn
324-
325-
CUDA_VISIBLE_DEVICES="0,1" python train_albert.py --engine=fs-ddp-amp \
326-
--master-addr=127.0.0.1 \
327-
--master-port=2112 \
328-
--world-size=8 \
329-
--dist-rank=0 \
330-
--num-workers=8 \
331-
--sync-bn
332-
333-
CUDA_VISIBLE_DEVICES="0,1" python train_albert.py --engine=fs-fddp \
334-
--master-addr=127.0.0.1 \
335-
--master-port=2112 \
336-
--world-size=8 \
337-
--dist-rank=0 \
338-
--num-workers=8 \
339-
--sync-bn
340-
```
341-
</p>
342-
</details>
145+
-->

0 commit comments

Comments
 (0)