Skip to content

Commit 7b41850

Browse files
authored
fix: ddp for multi-node setup fixed (#1402)
1 parent d6849fe commit 7b41850

5 files changed

Lines changed: 77 additions & 254 deletions

File tree

catalyst/engines/torch.py

Lines changed: 18 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -59,6 +59,14 @@ class DistributedDataParallelEngine(Engine):
5959
during distributed training
6060
world_size: the number of processes to use for distributed training.
6161
Should be less than or equal to the number of GPUs
62+
workers_dist_rank: the rank of the first process to run on the node.
63+
It should be a number between `number of initialized processes`
64+
and `world_size - 1`, the other processes on the node will have ranks
65+
`# of initialized processes + 1`, `# of initialized processes + 2`, ...,
66+
`# of initialized processes + num_node_workers - 1`
67+
num_node_workers: the number of processes to launch on the node.
68+
For GPU training, this is recommended to be set to the number of GPUs
69+
on the current node so that each process can be bound to a single GPU
6270
process_group_kwargs: parameters for `torch.distributed.init_process_group`.
6371
More info here:
6472
https://pytorch.org/docs/stable/distributed.html#torch.distributed.init_process_group # noqa: E501, W505
@@ -72,13 +80,17 @@ def __init__(
7280
address: str = "127.0.0.1",
7381
port: Union[str, int] = 2112,
7482
world_size: Optional[int] = None,
83+
workers_dist_rank: int = 0,
84+
num_node_workers: Optional[int] = None,
7585
process_group_kwargs: Dict[str, Any] = None,
7686
**kwargs
7787
):
7888
"""Init."""
7989
self._address = os.environ.get("MASTER_ADDR", address)
8090
self._port = os.environ.get("MASTER_PORT", port)
81-
self._world_size = world_size
91+
self._num_local_workers = num_node_workers or torch.cuda.device_count() or 1
92+
self._workers_global_rank = workers_dist_rank
93+
self._world_size = world_size or self._num_local_workers
8294
self._process_group_kwargs = process_group_kwargs or {}
8395
self._args = args
8496
self._kwargs = kwargs
@@ -100,11 +112,10 @@ def spawn(self, fn: Callable, *args, **kwargs):
100112
Returns:
101113
wrapped function (if needed).
102114
"""
103-
world_size: int = self._world_size or torch.cuda.device_count()
104115
return mp.spawn(
105116
fn,
106-
args=(world_size,),
107-
nprocs=world_size,
117+
args=(self._world_size,),
118+
nprocs=self._num_local_workers,
108119
join=True,
109120
)
110121

@@ -121,10 +132,12 @@ def setup(self, local_rank: int, world_size: int):
121132
"world_size": world_size,
122133
**self._process_group_kwargs,
123134
}
135+
global_rank = self._workers_global_rank + local_rank
136+
124137
os.environ["MASTER_ADDR"] = str(self._address)
125138
os.environ["MASTER_PORT"] = str(self._port)
126139
os.environ["WORLD_SIZE"] = str(world_size)
127-
os.environ["RANK"] = str(local_rank)
140+
os.environ["RANK"] = str(global_rank)
128141
os.environ["LOCAL_RANK"] = str(local_rank)
129142
dist.init_process_group(**process_group_kwargs)
130143
super().__init__(self, *self._args, **self._kwargs)

examples/engines/README.md

Lines changed: 20 additions & 217 deletions
Original file line numberDiff line numberDiff line change
@@ -18,21 +18,19 @@ pip install catalyst
1818
<p>
1919

2020
```bash
21-
CUDA_VISIBLE_DEVICES="0" python train_resnet.py --engine=de
21+
CUDA_VISIBLE_DEVICES="0" python train_resnet.py
2222

2323
CUDA_VISIBLE_DEVICES="0,1" python train_resnet.py --engine=dp
2424

2525
# distributed training
26-
CUDA_VISIBLE_DEVICES="0,1" python train_resnet.py --engine=ddp --sync-bn
26+
CUDA_VISIBLE_DEVICES="0,1" python train_resnet.py --engine=ddp
2727

2828
# multi-node distributed training
2929
CUDA_VISIBLE_DEVICES="0,1" python train_resnet.py --engine=ddp \
3030
--master-addr=127.0.0.1 \
31-
--master-port=2112 \
3231
--world-size=8 \
3332
--dist-rank=0 \
34-
--num-workers=8 \
35-
--sync-bn
33+
--num-workers=8
3634
```
3735
</p>
3836
</details>
@@ -44,21 +42,19 @@ CUDA_VISIBLE_DEVICES="0,1" python train_resnet.py --engine=ddp \
4442
```bash
4543
pip install datasets transformers
4644

47-
CUDA_VISIBLE_DEVICES="0" python train_albert.py --engine=de
45+
CUDA_VISIBLE_DEVICES="0" python train_albert.py
4846

4947
CUDA_VISIBLE_DEVICES="0,1" python train_albert.py --engine=dp
5048

5149
# distributed training
52-
CUDA_VISIBLE_DEVICES="0,1" python train_albert.py --engine=ddp --sync-bn
50+
CUDA_VISIBLE_DEVICES="0,1" python train_albert.py --engine=ddp
5351

5452
# multi-node distributed training
5553
CUDA_VISIBLE_DEVICES="0,1" python train_albert.py --engine=ddp \
5654
--master-addr=127.0.0.1 \
57-
--master-port=2112 \
5855
--world-size=8 \
5956
--dist-rank=0 \
60-
--num-workers=8 \
61-
--sync-bn
57+
--num-workers=8
6258
```
6359
</p>
6460
</details>
@@ -73,19 +69,19 @@ pip install torch>=1.8.0 catalyst
7369
<p>
7470

7571
```bash
76-
CUDA_VISIBLE_DEVICES="0,1" python train_resnet.py --engine=amp-dp
72+
CUDA_VISIBLE_DEVICES="0" python train_resnet.py --engine=gpu-amp
73+
74+
CUDA_VISIBLE_DEVICES="0,1" python train_resnet.py --engine=dp-amp
7775

7876
# distributed training
79-
CUDA_VISIBLE_DEVICES="0,1" python train_resnet.py --engine=amp-ddp --sync-bn
77+
CUDA_VISIBLE_DEVICES="0,1" python train_resnet.py --engine=ddp-amp
8078

8179
# multi-node distributed training
82-
CUDA_VISIBLE_DEVICES="0,1" python train_resnet.py --engine=amp-ddp \
80+
CUDA_VISIBLE_DEVICES="0,1" python train_resnet.py --engine=ddp-amp \
8381
--master-addr=127.0.0.1 \
84-
--master-port=2112 \
8582
--world-size=8 \
8683
--dist-rank=0 \
87-
--num-workers=8 \
88-
--sync-bn
84+
--num-workers=8
8985
```
9086
</p>
9187
</details>
@@ -97,23 +93,24 @@ CUDA_VISIBLE_DEVICES="0,1" python train_resnet.py --engine=amp-ddp \
9793
```bash
9894
pip install datasets transformers
9995

100-
CUDA_VISIBLE_DEVICES="0,1" python train_albert.py --engine=amp-dp
96+
CUDA_VISIBLE_DEVICES="0" python train_albert.py --engine=gpu-amp
97+
98+
CUDA_VISIBLE_DEVICES="0,1" python train_albert.py --engine=dp-amp
10199

102100
# distributed training
103-
CUDA_VISIBLE_DEVICES="0,1" python train_albert.py --engine=amp-ddp --sync-bn
101+
CUDA_VISIBLE_DEVICES="0,1" python train_albert.py --engine=ddp-amp
104102

105103
# multi-node distributed training
106-
CUDA_VISIBLE_DEVICES="0,1" python train_albert.py --engine=amp-ddp \
104+
CUDA_VISIBLE_DEVICES="0,1" python train_albert.py --engine=ddp-amp \
107105
--master-addr=127.0.0.1 \
108-
--master-port=2112 \
109106
--world-size=8 \
110107
--dist-rank=0 \
111-
--num-workers=8 \
112-
--sync-bn
108+
--num-workers=8
113109
```
114110
</p>
115111
</details>
116112

113+
<!--
117114
### PyTorch XLA
118115
```bash
119116
pip install catalyst
@@ -145,198 +142,4 @@ python train_albert.py --engine=xla-ddp
145142
```
146143
</p>
147144
</details>
148-
149-
## Extensions
150-
151-
### Nvidia APEX
152-
```bash
153-
pip install catalyst && install-apex
154-
# or git clone https://github.com/NVIDIA/apex && cd apex && pip install -e .
155-
```
156-
157-
<details open>
158-
<summary>CV - ResNet</summary>
159-
<p>
160-
161-
```bash
162-
CUDA_VISIBLE_DEVICES="0,1" python train_resnet.py --engine=apex-dp
163-
164-
# distributed training
165-
CUDA_VISIBLE_DEVICES="0,1" python train_resnet.py --engine=apex-ddp --sync-bn
166-
167-
# multi-node distributed training
168-
CUDA_VISIBLE_DEVICES="0,1" python train_resnet.py --engine=apex-ddp \
169-
--master-addr=127.0.0.1 \
170-
--master-port=2112 \
171-
--world-size=8 \
172-
--dist-rank=0 \
173-
--num-workers=8 \
174-
--sync-bn
175-
```
176-
</p>
177-
</details>
178-
179-
<details>
180-
<summary>NLP - Albert</summary>
181-
<p>
182-
183-
```bash
184-
pip install datasets transformers
185-
186-
CUDA_VISIBLE_DEVICES="0,1" python train_albert.py --engine=apex-dp
187-
188-
# distributed training
189-
CUDA_VISIBLE_DEVICES="0,1" python train_albert.py --engine=apex-ddp --sync-bn
190-
191-
# multi-node distributed training
192-
CUDA_VISIBLE_DEVICES="0,1" python train_albert.py --engine=apex-ddp \
193-
--master-addr=127.0.0.1 \
194-
--master-port=2112 \
195-
--world-size=8 \
196-
--dist-rank=0 \
197-
--num-workers=8 \
198-
--sync-bn
199-
```
200-
</p>
201-
</details>
202-
203-
### DeepSpeed
204-
> *Tested under `docker pull deepspeed/deepspeed:v031_torch17_cuda11 and pip install -U torch==1.7.0 deepspeed==0.4.1 catalyst==21.12`.*
205-
```bash
206-
# docker pull deepspeed/deepspeed:v031_torch17_cuda11
207-
# docker run --rm -it -v $(pwd):/workspace deepspeed/deepspeed:v031_torch17_cuda11 /bin/bash
208-
pip install catalyst[deepspeed]
209-
```
210-
211-
<details open>
212-
<summary>CV - ResNet</summary>
213-
<p>
214-
215-
```bash
216-
# distributed training
217-
CUDA_VISIBLE_DEVICES="0,1" python train_resnet.py --engine=ds-ddp
218-
219-
# multi-node distributed training
220-
CUDA_VISIBLE_DEVICES="0,1" python train_resnet.py --engine=ds-ddp \
221-
--master-addr=127.0.0.1 \
222-
--master-port=2112 \
223-
--world-size=8 \
224-
--dist-rank=0 \
225-
--num-workers=8
226-
```
227-
</p>
228-
</details>
229-
230-
<details>
231-
<summary>NLP - Albert</summary>
232-
<p>
233-
234-
```bash
235-
pip install datasets transformers
236-
237-
# distributed training
238-
CUDA_VISIBLE_DEVICES="0,1" python train_albert.py --engine=ds-ddp
239-
240-
# multi-node distributed training
241-
CUDA_VISIBLE_DEVICES="0,1" python train_albert.py --engine=ds-ddp \
242-
--master-addr=127.0.0.1 \
243-
--master-port=2112 \
244-
--world-size=8 \
245-
--dist-rank=0 \
246-
--num-workers=8 \
247-
--sync-bn
248-
```
249-
</p>
250-
</details>
251-
252-
### FairScale
253-
> *Tested under `pip install -U torch==1.8.1 fairscale==0.3.7 catalyst==21.12`*
254-
```bash
255-
pip install torch>=1.8.0 catalyst[fairscale]
256-
```
257-
258-
<details open>
259-
<summary>CV - ResNet</summary>
260-
<p>
261-
262-
```bash
263-
CUDA_VISIBLE_DEVICES="0,1" python train_resnet.py --engine=fs-pp
264-
265-
# distributed training
266-
CUDA_VISIBLE_DEVICES="0,1" python train_resnet.py --engine=fs-ddp --sync-bn
267-
268-
CUDA_VISIBLE_DEVICES="0,1" python train_resnet.py --engine=fs-ddp-amp --sync-bn
269-
270-
CUDA_VISIBLE_DEVICES="0,1" python train_resnet.py --engine=fs-fddp --sync-bn
271-
272-
# multi-node distributed training
273-
CUDA_VISIBLE_DEVICES="0,1" python train_resnet.py --engine=fs-ddp \
274-
--master-addr=127.0.0.1 \
275-
--master-port=2112 \
276-
--world-size=8 \
277-
--dist-rank=0 \
278-
--num-workers=8 \
279-
--sync-bn
280-
281-
CUDA_VISIBLE_DEVICES="0,1" python train_resnet.py --engine=fs-ddp-amp \
282-
--master-addr=127.0.0.1 \
283-
--master-port=2112 \
284-
--world-size=8 \
285-
--dist-rank=0 \
286-
--num-workers=8 \
287-
--sync-bn
288-
289-
CUDA_VISIBLE_DEVICES="0,1" python train_resnet.py --engine=fs-fddp \
290-
--master-addr=127.0.0.1 \
291-
--master-port=2112 \
292-
--world-size=8 \
293-
--dist-rank=0 \
294-
--num-workers=8 \
295-
--sync-bn
296-
```
297-
</p>
298-
</details>
299-
300-
<details>
301-
<summary>NLP - Albert</summary>
302-
<p>
303-
304-
```bash
305-
pip install datasets transformers
306-
307-
CUDA_VISIBLE_DEVICES="0,1" python train_albert.py --engine=fs-pp
308-
309-
# distributed training
310-
CUDA_VISIBLE_DEVICES="0,1" python train_albert.py --engine=fs-ddp --sync-bn
311-
312-
CUDA_VISIBLE_DEVICES="0,1" python train_albert.py --engine=fs-ddp-amp --sync-bn
313-
314-
CUDA_VISIBLE_DEVICES="0,1" python train_albert.py --engine=fs-fddp --sync-bn
315-
316-
# multi-node distributed training
317-
CUDA_VISIBLE_DEVICES="0,1" python train_albert.py --engine=fs-ddp \
318-
--master-addr=127.0.0.1 \
319-
--master-port=2112 \
320-
--world-size=8 \
321-
--dist-rank=0 \
322-
--num-workers=8 \
323-
--sync-bn
324-
325-
CUDA_VISIBLE_DEVICES="0,1" python train_albert.py --engine=fs-ddp-amp \
326-
--master-addr=127.0.0.1 \
327-
--master-port=2112 \
328-
--world-size=8 \
329-
--dist-rank=0 \
330-
--num-workers=8 \
331-
--sync-bn
332-
333-
CUDA_VISIBLE_DEVICES="0,1" python train_albert.py --engine=fs-fddp \
334-
--master-addr=127.0.0.1 \
335-
--master-port=2112 \
336-
--world-size=8 \
337-
--dist-rank=0 \
338-
--num-workers=8 \
339-
--sync-bn
340-
```
341-
</p>
342-
</details>
145+
-->

0 commit comments

Comments
 (0)