mirror of
https://github.com/Vision-CAIR/MiniGPT-4.git
synced 2025-04-07 03:20:48 +00:00
36 lines
849 B
Bash
36 lines
849 B
Bash
#!/bin/bash
#
# Launch train.py through torchrun, for single- or multi-node runs.
#
# Usage:
#   <script> CONFIG [extra args forwarded to train.py ...]
#
# Arguments:
#   CONFIG       value passed to train.py as --cfg-path (required)
#   remaining    forwarded verbatim to train.py after the config path
#
# Environment (each may be overridden by the caller):
#   NNODES       passed to torchrun --nnodes          (default: 1)
#   NODE_RANK    passed to torchrun --node_rank       (default: 0)
#   GPUS         passed to torchrun --nproc_per_node  (default: $ARNOLD_WORKER_GPU;
#                one of the two must be set)
#   MASTER_ADDR  passed to torchrun --master_addr     (default: 127.0.0.1)
#   MASTER_PORT  passed to torchrun --master_port     (default: 9909)
#
# Exit status: 2 on a usage/configuration error, otherwise that of torchrun.

# Trace every command so the final torchrun invocation shows up in the job log.
set -x

NNODES=${NNODES:-1}
NODE_RANK=${NODE_RANK:-0}
# The inner default is explicitly empty so the check below can report a clear
# error instead of handing torchrun an empty --nproc_per_node= value.
GPUS=${GPUS:-${ARNOLD_WORKER_GPU:-}}
MASTER_ADDR=${MASTER_ADDR:-"127.0.0.1"}
MASTER_PORT=${MASTER_PORT:-9909}
# ARNOLD_WORKER_0_PORT
# ARNOLD_WORKER_0_ADDR
# NOTE(review): the two names above are presumably the platform-provided
# alternatives for MASTER_PORT / MASTER_ADDR; they are not read here -- confirm.

if [[ -z "$GPUS" ]]; then
  printf '%s\n' 'error: set GPUS (or ARNOLD_WORKER_GPU) to the number of processes per node' >&2
  exit 2
fi

# settings for torch log
export BYTED_TORCH_FX=O1
export BYTED_TORCH_BYTECCL=O1
export TOKENIZERS_PARALLELISM=false
export HADOOP_ROOT_LOGGER=error,console

# settings for DDP multi-node for lab.pytorch image >= 1.13
export OMP_NUM_THREADS=8
export NCCL_IB_DISABLE=0
export NCCL_IB_GID_INDEX=3
export NCCL_SOCKET_IFNAME=eth0
export NCCL_SHM_DISABLE=0

# start training
if [[ $# -lt 1 || -z "$1" ]]; then
  printf 'Usage: %s CONFIG [train.py args...]\n' "${0##*/}" >&2
  exit 2
fi
CONFIG=$1
shift # "$@" now holds only the extra arguments destined for train.py

# Every expansion is quoted so that paths or option values containing spaces
# or glob characters reach torchrun / train.py as single, unmodified words.
torchrun --nnodes="$NNODES" \
  --node_rank="$NODE_RANK" \
  --nproc_per_node="$GPUS" \
  --master_addr="$MASTER_ADDR" \
  --master_port="$MASTER_PORT" \
  train.py \
  --cfg-path \
  "$CONFIG" \
  "$@"