Deepspeed

DeepSpeed is a deep learning training optimization library, providing the means to train massive billion parameter models at scale. Fully compatible with PyTorch, DeepSpeed features implementations of novel memory-efficient distributed training methods, based on the Zero Redundancy Optimizer (ZeRO) concept. Through the use of ZeRO, DeepSpeed enables distributed storage and computing of different elements of a training task - such as optimizer states, model weights, model gradients and model activations - across multiple devices, including GPU, CPU, local hard disk, and/or combinations of these devices. This "pooling" of resources, notably for storage, allows models with massive amounts of parameters to be trained efficiently, across multiple nodes, without explicitly handling Model, Pipeline or Data Parallelism in your code.

Installing DeepSpeed

Our recommendation is to install it using our provided Python wheel as follows:

1. Load a Python module with module load python.
2. Create and activate a virtual environment.
3. Install both PyTorch and DeepSpeed in the virtual environment with pip install.
(venv) [name@server ~]$ pip install --no-index torch deepspeed
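
To confirm that the installation worked, a quick sanity check (a minimal sketch; run it inside the activated virtual environment) is to import both packages and print their versions:

# verify that torch and deepspeed import and report their versions
import torch
import deepspeed

print("torch", torch.__version__)
print("deepspeed", deepspeed.__version__)
print("CUDA available:", torch.cuda.is_available())

Note that torch.cuda.is_available() will normally return True only inside a job that has been allocated a GPU, not on a login node.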

Multi-GPU and multi-node jobs with DeepSpeed

In the example that follows, we use DeepSpeed to reproduce our PyTorch tutorial on how to train a model with multiple GPUs distributed over multiple nodes. Notable differences are:

1. Here we define and configure several common elements of the training task (such as the optimizer, learning rate scheduler, batch size and more) in a config file, rather than in the main Python script.
2. We also define DeepSpeed-specific options, such as which ZeRO stage to use, in the same config file.


File : deepspeed-example.sh

#!/bin/bash
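# Resource request: 2 nodes with 1 task per node; each task gets 2 GPUs and
# 4 CPU cores, with 16000M of memory per node and 10 minutes of walltime.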
#SBATCH --nodes 2
#SBATCH --ntasks-per-node=1
#SBATCH --gpus-per-task=2
#SBATCH --cpus-per-task=4 
#SBATCH --mem=16000M       
#SBATCH --time=0-00:10
#SBATCH --output=%N-%j.out

## Create a virtualenv and install deepspeed + its dependencies on all nodes ##
srun -N $SLURM_NNODES -n $SLURM_NNODES config_env.sh

export HEAD_NODE=$(hostname) # store head node's address

module load cuda/11.4

srun launch_training_deepspeed.sh


Where the script config_env.sh is:


File : config_env.sh

#!/bin/bash
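# This script runs once per node (launched by srun in deepspeed-example.sh).
# $SLURM_TMPDIR is node-local storage, so each node builds its own copy of the
# virtual environment; --no-index makes pip install from the wheels available
# on the cluster rather than downloading from PyPI.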

module load python

virtualenv --no-download $SLURM_TMPDIR/ENV

source $SLURM_TMPDIR/ENV/bin/activate

pip install --upgrade pip --no-index

pip install --no-index torchvision deepspeed

echo "Done installing virtualenv!"


The script launch_training_deepspeed.sh is shown below. Notice that we use torchrun to launch our Python script. While DeepSpeed has its own launcher, we do not recommend using it at this time:


File : launch_training_deepspeed.sh

#!/bin/bash

source $SLURM_TMPDIR/ENV/bin/activate
export NCCL_ASYNC_ERROR_HANDLING=1

echo "r$SLURM_NODEID master: $HEAD_NODE"
echo "r$SLURM_NODEID Launching python script"

# --nproc_per_node must equal the number of GPUs on each node!
torchrun \
--nnodes $SLURM_NNODES \
--nproc_per_node 2 \
--rdzv_backend c10d \
--rdzv_endpoint "$HEAD_NODE" \
pytorch-deepspeed.py --deepspeed_config="./ds_config.json"


Next we define and configure our training task in the file ds_config.json. Here we set ZeRO to stage 0, meaning ZeRO is disabled: no model parallelism takes place and this is a purely data-parallel job. We also enable mixed-precision training, where some tensors are computed and stored in half precision (fp16) to speed up computation and reduce memory usage. See DeepSpeed's documentation for more details on all configurable parameters.


File : ds_config.json

{
  "train_batch_size": 16,
  "steps_per_print": 2000,
  "optimizer": {
    "type": "Adam",
    "params": {
      "lr": 0.001,
      "betas": [
        0.8,
        0.999
      ],
      "eps": 1e-8,
      "weight_decay": 3e-7
    }
  },
  "scheduler": {
    "type": "WarmupLR",
    "params": {
      "warmup_min_lr": 0,
      "warmup_max_lr": 0.001,
      "warmup_num_steps": 1000
    }
  },
  "gradient_clipping": 1.0,
  "prescale_gradients": false,
  "fp16": {
      "enabled": true,
      "fp16_master_weights_and_grads": false,
      "loss_scale": 0,
      "loss_scale_window": 500,
      "hysteresis": 2,
      "min_loss_scale": 1,
      "initial_scale_power": 15
  },
  "wall_clock_breakdown": false,
  "zero_optimization": {
      "stage": 0,
      "allgather_partitions": true,
      "reduce_scatter": true,
      "allgather_bucket_size": 50000000,
      "reduce_bucket_size": 50000000,
      "overlap_comm": true,
      "contiguous_gradients": true,
      "cpu_offload": false
  }
}
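
Instead of pointing --deepspeed_config at a file, the same settings can be passed to deepspeed.initialize directly as a Python dictionary through its config argument. A minimal sketch (the stage value of 2 is only an illustration of how one might enable ZeRO sharding of optimizer states and gradients once the purely data-parallel setup works):

import deepspeed

# net, parameters and dataset_train are defined as in pytorch-deepspeed.py below
ds_config = {
    "train_batch_size": 16,
    "optimizer": {"type": "Adam", "params": {"lr": 0.001}},
    "fp16": {"enabled": True},
    "zero_optimization": {"stage": 2},  # stage 0 disables ZeRO, as in ds_config.json above
}

model_engine, optimizer, trainloader, __ = deepspeed.initialize(
    model=net,
    model_parameters=parameters,
    training_data=dataset_train,
    config=ds_config,
)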


And finally, pytorch-deepspeed.py is:


File : pytorch-deepspeed.py

import os
import time
import datetime

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import torch.backends.cudnn as cudnn

import torchvision
import torchvision.transforms as transforms
from torchvision.datasets import CIFAR10
from torch.utils.data import DataLoader

import deepspeed

import torch.distributed as dist
import torch.utils.data.distributed

import argparse

parser = argparse.ArgumentParser(description='cifar10 classification models, deepspeed test')
parser.add_argument('--local_rank', type=int, default=-1, help='local rank argument added automatically by the launcher')

parser = deepspeed.add_config_arguments(parser)
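# deepspeed.add_config_arguments() adds DeepSpeed's own command-line flags
# (including --deepspeed_config, passed in by launch_training_deepspeed.sh) to the parser.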

def main():

    args = parser.parse_args()

    deepspeed.init_distributed()
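    # init_distributed() joins the process group using the environment variables
    # set by torchrun (RANK, LOCAL_RANK, WORLD_SIZE, MASTER_ADDR, MASTER_PORT),
    # with the NCCL backend by default.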

    class Net(nn.Module):

       def __init__(self):
          super(Net, self).__init__()

          self.conv1 = nn.Conv2d(3, 6, 5)
          self.pool = nn.MaxPool2d(2, 2)
          self.conv2 = nn.Conv2d(6, 16, 5)
          self.fc1 = nn.Linear(16 * 5 * 5, 120)
          self.fc2 = nn.Linear(120, 84)
          self.fc3 = nn.Linear(84, 10)

       def forward(self, x):
          x = self.pool(F.relu(self.conv1(x)))
          x = self.pool(F.relu(self.conv2(x)))
          x = x.view(-1, 16 * 5 * 5)
          x = F.relu(self.fc1(x))
          x = F.relu(self.fc2(x))
          x = self.fc3(x)
          return x

    net = Net()

    parameters = filter(lambda p: p.requires_grad, net.parameters())

    transform_train = transforms.Compose([transforms.ToTensor(),transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5))])

    dataset_train = CIFAR10(root='./data', train=True, download=False, transform=transform_train)
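    # download=False assumes the CIFAR-10 data is already present in ./data
    # (e.g. downloaded beforehand on a login node), since compute nodes may not
    # have internet access.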

    model_engine, optimizer, trainloader, __ = deepspeed.initialize(args=args, model=net, model_parameters=parameters, training_data=dataset_train)
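    # deepspeed.initialize() wraps the model in a DeepSpeed engine and builds the
    # optimizer, distributed DataLoader and learning rate scheduler (the unused
    # fourth return value) from the settings in ds_config.json.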

    fp16 = model_engine.fp16_enabled()

    criterion = nn.CrossEntropyLoss().cuda()

    for batch_idx, (inputs, targets) in enumerate(trainloader):

       inputs = inputs.to(model_engine.local_rank)
       if fp16:
          inputs = inputs.half()
       targets = targets.to(model_engine.local_rank)
       outputs = model_engine(inputs)
       loss = criterion(outputs, targets)

       # backward() and step() on the DeepSpeed engine replace loss.backward() and
       # optimizer.step(); the engine handles fp16 loss scaling and gradient
       # averaging across all GPUs.
       model_engine.backward(loss)
       model_engine.step()

if __name__=='__main__':
   main()
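
The loop above does not save any model state. To add checkpointing, the DeepSpeed engine provides save_checkpoint and load_checkpoint, which must be called by every rank so that the engine can coordinate the writes. A minimal sketch, to be adapted inside the training loop of pytorch-deepspeed.py (the directory and tag names below are only placeholders):

    # save model, optimizer and engine state; every rank makes this call
    model_engine.save_checkpoint("./checkpoints", tag="latest")

    # on restart, restore the saved state before resuming training
    load_path, client_state = model_engine.load_checkpoint("./checkpoints", tag="latest")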