@@ -18,21 +18,19 @@ pip install catalyst
1818<p >
1919
2020``` bash
21- CUDA_VISIBLE_DEVICES=" 0" python train_resnet.py --engine=de
21+ CUDA_VISIBLE_DEVICES=" 0" python train_resnet.py
2222
2323CUDA_VISIBLE_DEVICES=" 0,1" python train_resnet.py --engine=dp
2424
2525# distributed training
26- CUDA_VISIBLE_DEVICES=" 0,1" python train_resnet.py --engine=ddp --sync-bn
26+ CUDA_VISIBLE_DEVICES=" 0,1" python train_resnet.py --engine=ddp
2727
2828# multi-node distributed training
2929CUDA_VISIBLE_DEVICES=" 0,1" python train_resnet.py --engine=ddp \
3030 --master-addr=127.0.0.1 \
31- --master-port=2112 \
3231 --world-size=8 \
3332 --dist-rank=0 \
34- --num-workers=8 \
35- --sync-bn
33+ --num-workers=8
3634```
3735</p >
3836</details >
@@ -44,21 +42,19 @@ CUDA_VISIBLE_DEVICES="0,1" python train_resnet.py --engine=ddp \
4442``` bash
4543pip install datasets transformers
4644
47- CUDA_VISIBLE_DEVICES=" 0" python train_albert.py --engine=de
45+ CUDA_VISIBLE_DEVICES=" 0" python train_albert.py
4846
4947CUDA_VISIBLE_DEVICES=" 0,1" python train_albert.py --engine=dp
5048
5149# distributed training
52- CUDA_VISIBLE_DEVICES=" 0,1" python train_albert.py --engine=ddp --sync-bn
50+ CUDA_VISIBLE_DEVICES=" 0,1" python train_albert.py --engine=ddp
5351
5452# multi-node distributed training
5553CUDA_VISIBLE_DEVICES=" 0,1" python train_albert.py --engine=ddp \
5654 --master-addr=127.0.0.1 \
57- --master-port=2112 \
5855 --world-size=8 \
5956 --dist-rank=0 \
60- --num-workers=8 \
61- --sync-bn
57+ --num-workers=8
6258```
6359</p >
6460</details >
@@ -73,19 +69,19 @@ pip install torch>=1.8.0 catalyst
7369<p >
7470
7571``` bash
76- CUDA_VISIBLE_DEVICES=" 0,1" python train_resnet.py --engine=amp-dp
72+ CUDA_VISIBLE_DEVICES=" 0" python train_resnet.py --engine=gpu-amp
73+
74+ CUDA_VISIBLE_DEVICES=" 0,1" python train_resnet.py --engine=dp-amp
7775
7876# distributed training
79- CUDA_VISIBLE_DEVICES=" 0,1" python train_resnet.py --engine=amp- ddp --sync-bn
77+ CUDA_VISIBLE_DEVICES=" 0,1" python train_resnet.py --engine=ddp-amp
8078
8179# multi-node distributed training
82- CUDA_VISIBLE_DEVICES=" 0,1" python train_resnet.py --engine=amp- ddp \
80+ CUDA_VISIBLE_DEVICES=" 0,1" python train_resnet.py --engine=ddp-amp \
8381 --master-addr=127.0.0.1 \
84- --master-port=2112 \
8582 --world-size=8 \
8683 --dist-rank=0 \
87- --num-workers=8 \
88- --sync-bn
84+ --num-workers=8
8985```
9086</p >
9187</details >
@@ -97,23 +93,24 @@ CUDA_VISIBLE_DEVICES="0,1" python train_resnet.py --engine=amp-ddp \
9793``` bash
9894pip install datasets transformers
9995
100- CUDA_VISIBLE_DEVICES=" 0,1" python train_albert.py --engine=amp-dp
96+ CUDA_VISIBLE_DEVICES=" 0" python train_albert.py --engine=gpu-amp
97+
98+ CUDA_VISIBLE_DEVICES=" 0,1" python train_albert.py --engine=dp-amp
10199
102100# distributed training
103- CUDA_VISIBLE_DEVICES=" 0,1" python train_albert.py --engine=amp- ddp --sync-bn
101+ CUDA_VISIBLE_DEVICES=" 0,1" python train_albert.py --engine=ddp-amp
104102
105103# multi-node distributed training
106- CUDA_VISIBLE_DEVICES=" 0,1" python train_albert.py --engine=amp- ddp \
104+ CUDA_VISIBLE_DEVICES=" 0,1" python train_albert.py --engine=ddp-amp \
107105 --master-addr=127.0.0.1 \
108- --master-port=2112 \
109106 --world-size=8 \
110107 --dist-rank=0 \
111- --num-workers=8 \
112- --sync-bn
108+ --num-workers=8
113109```
114110</p >
115111</details >
116112
113+ <!--
117114### PyTorch XLA
118115```bash
119116pip install catalyst
@@ -145,198 +142,4 @@ python train_albert.py --engine=xla-ddp
145142```
146143</p>
147144</details>
148-
149- ## Extensions
150-
151- ### Nvidia APEX
152- ``` bash
153- pip install catalyst && install-apex
154- # or git clone https://github.com/NVIDIA/apex && cd apex && pip install -e .
155- ```
156-
157- <details open >
158- <summary >CV - ResNet</summary >
159- <p >
160-
161- ``` bash
162- CUDA_VISIBLE_DEVICES=" 0,1" python train_resnet.py --engine=apex-dp
163-
164- # distributed training
165- CUDA_VISIBLE_DEVICES=" 0,1" python train_resnet.py --engine=apex-ddp --sync-bn
166-
167- # multi-node distributed training
168- CUDA_VISIBLE_DEVICES=" 0,1" python train_resnet.py --engine=apex-ddp \
169- --master-addr=127.0.0.1 \
170- --master-port=2112 \
171- --world-size=8 \
172- --dist-rank=0 \
173- --num-workers=8 \
174- --sync-bn
175- ```
176- </p >
177- </details >
178-
179- <details >
180- <summary >NLP - Albert</summary >
181- <p >
182-
183- ``` bash
184- pip install datasets transformers
185-
186- CUDA_VISIBLE_DEVICES=" 0,1" python train_albert.py --engine=apex-dp
187-
188- # distributed training
189- CUDA_VISIBLE_DEVICES=" 0,1" python train_albert.py --engine=apex-ddp --sync-bn
190-
191- # multi-node distributed training
192- CUDA_VISIBLE_DEVICES=" 0,1" python train_albert.py --engine=apex-ddp \
193- --master-addr=127.0.0.1 \
194- --master-port=2112 \
195- --world-size=8 \
196- --dist-rank=0 \
197- --num-workers=8 \
198- --sync-bn
199- ```
200- </p >
201- </details >
202-
203- ### DeepSpeed
204- > *Tested under `docker pull deepspeed/deepspeed:v031_torch17_cuda11 and pip install -U torch==1.7.0 deepspeed==0.4.1 catalyst==21.12`.*
205- ``` bash
206- # docker pull deepspeed/deepspeed:v031_torch17_cuda11
207- # docker run --rm -it -v $(pwd):/workspace deepspeed/deepspeed:v031_torch17_cuda11 /bin/bash
208- pip install catalyst[deepspeed]
209- ```
210-
211- <details open >
212- <summary >CV - ResNet</summary >
213- <p >
214-
215- ``` bash
216- # distributed training
217- CUDA_VISIBLE_DEVICES=" 0,1" python train_resnet.py --engine=ds-ddp
218-
219- # multi-node distributed training
220- CUDA_VISIBLE_DEVICES=" 0,1" python train_resnet.py --engine=ds-ddp \
221- --master-addr=127.0.0.1 \
222- --master-port=2112 \
223- --world-size=8 \
224- --dist-rank=0 \
225- --num-workers=8
226- ```
227- </p >
228- </details >
229-
230- <details >
231- <summary >NLP - Albert</summary >
232- <p >
233-
234- ``` bash
235- pip install datasets transformers
236-
237- # distributed training
238- CUDA_VISIBLE_DEVICES=" 0,1" python train_albert.py --engine=ds-ddp
239-
240- # multi-node distributed training
241- CUDA_VISIBLE_DEVICES=" 0,1" python train_albert.py --engine=ds-ddp \
242- --master-addr=127.0.0.1 \
243- --master-port=2112 \
244- --world-size=8 \
245- --dist-rank=0 \
246- --num-workers=8 \
247- --sync-bn
248- ```
249- </p >
250- </details >
251-
252- ### FairScale
253- > *Tested under `pip install -U torch==1.8.1 fairscale==0.3.7 catalyst==21.12`*
254- ``` bash
255- pip install torch>=1.8.0 catalyst[fairscale]
256- ```
257-
258- <details open >
259- <summary >CV - ResNet</summary >
260- <p >
261-
262- ``` bash
263- CUDA_VISIBLE_DEVICES=" 0,1" python train_resnet.py --engine=fs-pp
264-
265- # distributed training
266- CUDA_VISIBLE_DEVICES=" 0,1" python train_resnet.py --engine=fs-ddp --sync-bn
267-
268- CUDA_VISIBLE_DEVICES=" 0,1" python train_resnet.py --engine=fs-ddp-amp --sync-bn
269-
270- CUDA_VISIBLE_DEVICES=" 0,1" python train_resnet.py --engine=fs-fddp --sync-bn
271-
272- # multi-node distributed training
273- CUDA_VISIBLE_DEVICES=" 0,1" python train_resnet.py --engine=fs-ddp \
274- --master-addr=127.0.0.1 \
275- --master-port=2112 \
276- --world-size=8 \
277- --dist-rank=0 \
278- --num-workers=8 \
279- --sync-bn
280-
281- CUDA_VISIBLE_DEVICES=" 0,1" python train_resnet.py --engine=fs-ddp-amp \
282- --master-addr=127.0.0.1 \
283- --master-port=2112 \
284- --world-size=8 \
285- --dist-rank=0 \
286- --num-workers=8 \
287- --sync-bn
288-
289- CUDA_VISIBLE_DEVICES=" 0,1" python train_resnet.py --engine=fs-fddp \
290- --master-addr=127.0.0.1 \
291- --master-port=2112 \
292- --world-size=8 \
293- --dist-rank=0 \
294- --num-workers=8 \
295- --sync-bn
296- ```
297- </p >
298- </details >
299-
300- <details >
301- <summary >NLP - Albert</summary >
302- <p >
303-
304- ``` bash
305- pip install datasets transformers
306-
307- CUDA_VISIBLE_DEVICES=" 0,1" python train_albert.py --engine=fs-pp
308-
309- # distributed training
310- CUDA_VISIBLE_DEVICES=" 0,1" python train_albert.py --engine=fs-ddp --sync-bn
311-
312- CUDA_VISIBLE_DEVICES=" 0,1" python train_albert.py --engine=fs-ddp-amp --sync-bn
313-
314- CUDA_VISIBLE_DEVICES=" 0,1" python train_albert.py --engine=fs-fddp --sync-bn
315-
316- # multi-node distributed training
317- CUDA_VISIBLE_DEVICES=" 0,1" python train_albert.py --engine=fs-ddp \
318- --master-addr=127.0.0.1 \
319- --master-port=2112 \
320- --world-size=8 \
321- --dist-rank=0 \
322- --num-workers=8 \
323- --sync-bn
324-
325- CUDA_VISIBLE_DEVICES=" 0,1" python train_albert.py --engine=fs-ddp-amp \
326- --master-addr=127.0.0.1 \
327- --master-port=2112 \
328- --world-size=8 \
329- --dist-rank=0 \
330- --num-workers=8 \
331- --sync-bn
332-
333- CUDA_VISIBLE_DEVICES=" 0,1" python train_albert.py --engine=fs-fddp \
334- --master-addr=127.0.0.1 \
335- --master-port=2112 \
336- --world-size=8 \
337- --dist-rank=0 \
338- --num-workers=8 \
339- --sync-bn
340- ```
341- </p >
342- </details >
145+ -->
0 commit comments