=Accelerate=
Accelerate is a package that enables any PyTorch code to be run across any distributed configuration by adding just four lines of code. This makes training and inference at scale simple, efficient and adaptable.
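Concretely, those four lines amount to importing and creating an <code>Accelerator</code>, passing the model, optimizer and data loader through <code>accelerator.prepare()</code>, and calling <code>accelerator.backward(loss)</code> in place of <code>loss.backward()</code>. The following is only a minimal, self-contained sketch of that pattern, using a toy model and random data purely for illustration; it is not part of the job scripts below.
{{File
  |name=accelerate_minimal.py
  |lang="python"
  |contents=
import torch
import torch.nn as nn
from torch.utils.data import DataLoader, TensorDataset
from accelerate import Accelerator

accelerator = Accelerator()          # 1. create the Accelerator

# A toy model and random data, purely to illustrate the pattern.
model = nn.Linear(10, 2)
optimizer = torch.optim.SGD(model.parameters(), lr=0.1)
dataset = TensorDataset(torch.randn(256, 10), torch.randint(0, 2, (256,)))
data_loader = DataLoader(dataset, batch_size=32, shuffle=True)
criterion = nn.CrossEntropyLoss()

# 2. let Accelerate wrap the training objects for whatever configuration the job runs under
model, optimizer, data_loader = accelerator.prepare(model, optimizer, data_loader)

for inputs, targets in data_loader:  # batches are placed on the right device automatically
    optimizer.zero_grad()
    loss = criterion(model(inputs), targets)
    accelerator.backward(loss)       # 3. accelerator.backward() replaces loss.backward()
    optimizer.step()
}}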
In the example that follows, we use <tt>accelerate</tt> to reproduce our [[PyTorch#PyTorch_with_Multiple_GPUs|PyTorch tutorial]] on how to train a model with multiple GPUs distributed over multiple nodes. Notable differences are:
:1. Here we ask for only one task per node and let <tt>accelerate</tt> start the appropriate number of processes (one per GPU) on each node.
:2. We pass the number of nodes in the job and each node's id to <tt>accelerate</tt> via the <tt>num_machines</tt> and <tt>machine_rank</tt> arguments respectively. Accelerate handles setting global and local ranks internally.
{{File
  |name=accelerate-example.sh
  |lang="bash"
  |contents=
#!/bin/bash
#SBATCH --nodes 2
#SBATCH --ntasks-per-node=1
#SBATCH --gpus-per-task=2
#SBATCH --cpus-per-task=4
#SBATCH --mem=16000M     
#SBATCH --time=0-00:10
#SBATCH --output=%N-%j.out
## Create a virtualenv and install accelerate + its dependencies on all nodes ##
srun -N $SLURM_NNODES -n $SLURM_NNODES config_env.sh
export HEAD_NODE=$(hostname) # store head node's address
export HEAD_NODE_PORT=34567 # choose a port on the main node to start accelerate's main process
srun launch_training_accelerate.sh
}}
The script <code>config_env.sh</code>, which the first <code>srun</code> call runs once on each node to build a local virtual environment in <code>$SLURM_TMPDIR</code>, is:
{{File
  |name=config_env.sh
  |lang="bash"
  |contents=
#!/bin/bash
module load python
virtualenv --no-download $SLURM_TMPDIR/ENV
source $SLURM_TMPDIR/ENV/bin/activate
pip install --upgrade pip --no-index
pip install --no-index torchvision accelerate
echo "Done installing virtualenv!"
}}
The script <code>launch_training_accelerate.sh</code>, which the second <code>srun</code> call runs once on each node to start <code>accelerate launch</code>, is:
{{File
  |name=launch_training_accelerate.sh
  |lang="bash"
  |contents=
#!/bin/bash
source $SLURM_TMPDIR/ENV/bin/activate
export NCCL_ASYNC_ERROR_HANDLING=1
echo "Node $SLURM_NODEID says: main node at $HEAD_NODE"
echo "Node $SLURM_NODEID says: Launching python script with accelerate..."
# --num_processes is the total number of GPUs across all nodes (2 nodes x 2 GPUs = 4 here)
accelerate launch \
--multi_gpu \
--gpu_ids="all" \
--num_machines=$SLURM_NNODES \
--machine_rank=$SLURM_NODEID \
--num_processes=4 \
--main_process_ip="$HEAD_NODE" \
--main_process_port=$HEAD_NODE_PORT \
pytorch-accelerate.py --batch_size 256 --num_workers=2
}}
And finally, <code>pytorch-accelerate.py</code> is:
{{File
  |name=pytorch-accelerate.py
  |lang="python"
  |contents=
import os
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import torchvision
import torchvision.transforms as transforms
from torchvision.datasets import CIFAR10
from torch.utils.data import DataLoader
import torch.utils.data.distributed
from accelerate import Accelerator
import argparse
parser = argparse.ArgumentParser(description='cifar10 classification models, distributed data parallel test')
parser.add_argument('--lr', type=float, default=0.1, help='learning rate')
parser.add_argument('--batch_size', type=int, default=64, help='batch size per process')
parser.add_argument('--num_workers', type=int, default=0, help='number of data loader workers per process')
def main():
    print("Starting...")
    args = parser.parse_args()
    accelerator = Accelerator()
    device = accelerator.device
    class Net(nn.Module):
      def __init__(self):
          super(Net, self).__init__()
          self.conv1 = nn.Conv2d(3, 6, 5)
          self.pool = nn.MaxPool2d(2, 2)
          self.conv2 = nn.Conv2d(6, 16, 5)
          self.fc1 = nn.Linear(16 * 5 * 5, 120)
          self.fc2 = nn.Linear(120, 84)
          self.fc3 = nn.Linear(84, 10)
      def forward(self, x):
          x = self.pool(F.relu(self.conv1(x)))
          x = self.pool(F.relu(self.conv2(x)))
          x = x.view(-1, 16 * 5 * 5)
          x = F.relu(self.fc1(x))
          x = F.relu(self.fc2(x))
          x = self.fc3(x)
          return x
    net = Net()
    net.to(device)
    transform_train = transforms.Compose([transforms.ToTensor(),transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5))])
    dataset_train = CIFAR10(root='./data', train=True, download=False, transform=transform_train)
    # accelerator.prepare() shards the DataLoader across processes, so no manual DistributedSampler is needed.
    train_loader = DataLoader(dataset_train, batch_size=args.batch_size, shuffle=True, num_workers=args.num_workers)
    criterion = nn.CrossEntropyLoss()
    optimizer = optim.SGD(net.parameters(), lr=args.lr, momentum=0.9, weight_decay=1e-4)
    net, optimizer, train_loader = accelerator.prepare(net, optimizer, train_loader)
    for batch in train_loader:
      inputs, targets = batch  # prepare() has already moved the batch to the right device
      optimizer.zero_grad()
      outputs = net(inputs)
      loss = criterion(outputs, targets)
      accelerator.backward(loss)
      optimizer.step()
    print("Done!")
if __name__=='__main__':
  main()
}}
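Note that <code>pytorch-accelerate.py</code> opens CIFAR-10 with <code>download=False</code>, so the dataset must already be present under <code>./data</code> in the directory the job runs from. Since compute nodes generally do not have Internet access, one way to fetch it is to run a short script such as the following once on a login node beforehand (the file name is arbitrary; the <code>./data</code> path simply matches the one used by the training script):
{{File
  |name=download_cifar10.py
  |lang="python"
  |contents=
from torchvision.datasets import CIFAR10

# One-time download on a login node; the training job then reads the
# extracted files from ./data with download=False.
CIFAR10(root='./data', train=True, download=True)
}}
With the dataset in place and the three scripts above in the same directory, the job can be submitted with <code>sbatch accelerate-example.sh</code>.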