PyTorch

PyTorch is a Python package that provides two high-level features (see the short example below):

  • Tensor computation (like NumPy) with strong GPU acceleration
  • Deep neural networks built on a tape-based autograd system
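
For example, a short interactive session, sketched below, exercises both features; the GPU portion runs only when a device is available:

import torch

# Tensor computation, similar to NumPy
a = torch.rand(3, 3)
b = torch.rand(3, 3)
c = a @ b                       # matrix product on the CPU
if torch.cuda.is_available():
    c = a.cuda() @ b.cuda()     # the same product on the GPU

# Tape-based autograd: operations are recorded as they run
x = torch.ones(3, requires_grad=True)
y = (x * x).sum()
y.backward()                    # fills x.grad with dy/dx = 2*x
print(c)
print(x.grad)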

If you are porting a PyTorch program to a Compute Canada cluster, you should follow our tutorial on the subject.

Disambiguation

PyTorch has a distant connection with Torch, but for all practical purposes you can treat them as separate projects.

PyTorch developers also offer LibTorch, which allows one to implement extensions to PyTorch using C++, and to implement pure C++ machine learning applications. Models written in Python using PyTorch can be converted and used in pure C++ through TorchScript.
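
As a rough sketch of that workflow (not taken from this page; the model and file name are only placeholders), a Python model can be traced into TorchScript and saved to an archive that C++ code can later load with torch::jit::load:

import torch
import torchvision

# Trace a model with an example input to convert it to TorchScript
model = torchvision.models.resnet18(pretrained=True).eval()
example = torch.rand(1, 3, 224, 224)
traced = torch.jit.trace(model, example)

# The saved archive can then be loaded from pure C++ via torch::jit::load("resnet18.pt")
traced.save("resnet18.pt")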

Installation

Latest available wheels

To see the latest version of PyTorch that we have built:

[name@server ~]$ avail_wheels "torch*"

For more information on listing wheels, see listing available wheels.

Installing Compute Canada wheel

The preferred option is to install it using the Python wheel as follows:

1. Load a Python module, either python/2.7, python/3.5, python/3.6 or python/3.7
2. Create and start a virtual environment.
3. Install PyTorch in the virtual environment with pip install.

GPU and CPU

(venv) [name@server ~] pip install --no-index torch

Extra

In addition to torch, you can install torchvision, torchtext and torchaudio:

(venv) [name@server ~] pip install --no-index torch torchvision torchtext torchaudio
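
A quick way to confirm that the packages were installed correctly is to import them and print their versions, for example (a sketch, run from within the activated virtual environment):

import torch
import torchvision
import torchtext
import torchaudio

# Print the installed version of each package
print(torch.__version__, torchvision.__version__, torchtext.__version__, torchaudio.__version__)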

Job submission

Here is an example of a job submission script using the python wheel, with a virtual environment inside a job:

File : pytorch-test.sh

#!/bin/bash
#SBATCH --gres=gpu:1       # Request GPU "generic resources"
#SBATCH --cpus-per-task=6  # Cores proportional to GPUs: 6 on Cedar, 16 on Graham.
#SBATCH --mem=32000M       # Memory proportional to GPUs: 32000 Cedar, 64000 Graham.
#SBATCH --time=0-03:00
#SBATCH --output=%N-%j.out

module load python/3.6
virtualenv --no-download $SLURM_TMPDIR/env
source $SLURM_TMPDIR/env/bin/activate
pip install torch --no-index

python pytorch-test.py


The Python script pytorch-test.py has the form

File : pytorch-test.py

import torch
x = torch.Tensor(5, 3)
print(x)
y = torch.rand(5, 3)
print(y)
# let us run the following only if CUDA is available
if torch.cuda.is_available():
    x = x.cuda()
    y = y.cuda()
    print(x + y)


You can then submit a PyTorch job with:

[name@server ~]$ sbatch pytorch-test.sh

PyTorch with Multiple GPUs

There are several ways to use PyTorch with multiple GPUs. This section features tutorials on two of them: using the DistributedDataParallel class and using the PyTorch Lightning package.

Using DistributedDataParallel

The DistributedDataParallel class is the way recommended by the PyTorch maintainers (see https://pytorch.org/tutorials/intermediate/ddp_tutorial.html#comparison-between-dataparallel-and-distributeddataparallel) to use multiple GPUs, whether they are all on a single node or distributed across multiple nodes. The following is a tutorial on using multiple GPUs distributed across two nodes:


File : pytorch-ddp-test.sh

#!/bin/bash
#SBATCH --nodes 2             # Request 2 nodes so all resources are on two nodes.
#SBATCH --gres=gpu:2          # Request 4 GPU "generic resources" in total. You will get 2 per node.
#SBATCH --tasks-per-node=2    # Request 1 process per GPU. Request more processes if using multiple DataLoader workers.
#SBATCH --mem=8G      
#SBATCH --time=0-03:00
#SBATCH --output=%N-%j.out

module load python/3.6
virtualenv --no-download $SLURM_TMPDIR/env
source $SLURM_TMPDIR/env/bin/activate
pip install torch --no-index

export MASTER_ADDR=$(hostname) # Store the master node's hostname in the MASTER_ADDR environment variable.

echo "r$SLURM_NODEID master: $MASTER_ADDR"


echo "r$SLURM_NODEID Launching python script"
srun python pytorch-ddp-test.py --init_method tcp://$MASTER_ADDR:3456 --world_size $SLURM_NTASKS --batch_size 256 # The SLURM_NTASKS variable tells the script how many processes are available for this execution. "srun" executes the script <tasks-per-node * nodes> times.


The Python script pytorch-ddp-test.py has the form

File : pytorch-ddp-test.py

import os
import time
import datetime

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import torch.backends.cudnn as cudnn

import torchvision
import torchvision.transforms as transforms
from torchvision.datasets import CIFAR10
from torch.utils.data import DataLoader

import torch.distributed as dist
import torch.utils.data.distributed

import argparse


parser = argparse.ArgumentParser(description='cifar10 classification models, distributed data parallel test')
parser.add_argument('--lr', type=float, default=0.1, help='')
parser.add_argument('--batch_size', type=int, default=768, help='')
parser.add_argument('--max_epochs', type=int, default=4, help='')
parser.add_argument('--num_workers', type=int, default=0, help='')

parser.add_argument('--init_method', default='tcp://127.0.0.1:3456', type=str, help='')
parser.add_argument('--dist-backend', default='gloo', type=str, help='')
parser.add_argument('--world_size', default=1, type=int, help='')
parser.add_argument('--distributed', action='store_true', help='')


def main():
    print("Starting...")

    args = parser.parse_args()

    ngpus_per_node = torch.cuda.device_count()

    print(ngpus_per_node)

    """ This next line is the key to DistributedDataParallel working on SLURM:
		SLURM_NODEID is 0 or 1 in this example, SLURM_LOCALID is the id of the 
 		current process inside a node and is also 0 or 1 in this example."""

    rank = int(os.environ.get("SLURM_NODEID"))*ngpus_per_node + int(os.environ.get("SLURM_LOCALID")) 

  
    """ this block initializes a process group and initiate communications
		between all processes running on all nodes """

    print('From Rank: {}, ==> Initializing Process Group...'.format(rank))
    #init the process group
    dist.init_process_group(backend=args.dist_backend, init_method=args.init_method, world_size=args.world_size, rank=rank)
    print("process group ready!")

    print('From Rank: {}, ==> Making model..'.format(rank))


    class Net(nn.Module):

       def __init__(self):
          super(Net, self).__init__()

          self.conv1 = nn.Conv2d(3, 6, 5)
          self.pool = nn.MaxPool2d(2, 2)
          self.conv2 = nn.Conv2d(6, 16, 5)
          self.fc1 = nn.Linear(16 * 5 * 5, 120)
          self.fc2 = nn.Linear(120, 84)
          self.fc3 = nn.Linear(84, 10)

       def forward(self, x):
          x = self.pool(F.relu(self.conv1(x)))
          x = self.pool(F.relu(self.conv2(x)))
          x = x.view(-1, 16 * 5 * 5)
          x = F.relu(self.fc1(x))
          x = F.relu(self.fc2(x))
          x = self.fc3(x)
          return x

    net = Net()

    net.cuda()
    net = torch.nn.parallel.DistributedDataParallel(net)

    print('From Rank: {}, ==> Preparing data..'.format(rank))

    transform_train = transforms.Compose([transforms.ToTensor(),transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5))])

    dataset_train = CIFAR10(root='./data', train=True, download=False, transform=transform_train)  # download=False: the CIFAR-10 data is expected to already be present under ./data

    train_sampler = torch.utils.data.distributed.DistributedSampler(dataset_train)
    train_loader = DataLoader(dataset_train, batch_size=args.batch_size, shuffle=(train_sampler is None), num_workers=args.num_workers, sampler=train_sampler)


    criterion = nn.CrossEntropyLoss().cuda()
    optimizer = optim.SGD(net.parameters(), lr=args.lr, momentum=0.9, weight_decay=1e-4)

    for epoch in range(args.max_epochs):

        train_sampler.set_epoch(epoch)

        train(epoch, net, criterion, optimizer, train_loader, rank)


def train(epoch, net, criterion, optimizer, train_loader, train_rank):

    train_loss = 0
    correct = 0
    total = 0

    epoch_start = time.time()

    for batch_idx, (inputs, targets) in enumerate(train_loader):

       start = time.time()

       inputs = inputs.cuda()
       targets = targets.cuda()
       outputs = net(inputs)
       loss = criterion(outputs, targets)

       optimizer.zero_grad()
       loss.backward()
       optimizer.step()

       train_loss += loss.item()
       _, predicted = outputs.max(1)
       total += targets.size(0)
       correct += predicted.eq(targets).sum().item()
       acc = 100 * correct / total

       batch_time = time.time() - start


       elapse_time = time.time() - epoch_start
       elapse_time = datetime.timedelta(seconds=elapse_time)
       print("From Rank: {}, Training time {}".format(train_rank, elapse_time))

if __name__=='__main__':
   main()



Troubleshooting

Memory leak

On AVX512 hardware (Béluga, Skylake or V100 nodes), older versions of PyTorch (less than v1.0.1) using older libraries (cuDNN < v7.5 or MAGMA < v2.5) may leak memory considerably, resulting in an out-of-memory exception and the death of your tasks. Please upgrade to the latest torch version.
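
To check whether an existing environment falls into that range, a quick sketch along these lines can be run inside the virtual environment (cuDNN reports its version as a single integer, e.g. 7500 for v7.5.0):

import torch
import torch.backends.cudnn as cudnn

print("PyTorch:", torch.__version__)   # affected if below 1.0.1
print("cuDNN:", cudnn.version())       # affected if below 7500 (v7.5)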

LibTorch

LibTorch allows one to implement both C++ extensions to PyTorch and pure C++ machine learning applications. It contains "all headers, libraries and CMake configuration files required to depend on PyTorch" (as mentioned in the docs).

How to use LibTorch

Get the library

wget https://download.pytorch.org/libtorch/cu100/libtorch-shared-with-deps-latest.zip
unzip libtorch-shared-with-deps-latest.zip
cd libtorch
export LIBTORCH_ROOT=$(pwd)  # this variable is used in the example below

Patch the library (this workaround is needed for compiling on Compute Canada clusters):

sed -i -e 's/\/usr\/local\/cuda\/lib64\/libculibos.a;dl;\/usr\/local\/cuda\/lib64\/libculibos.a;//g' share/cmake/Caffe2/Caffe2Targets.cmake

Compile a minimal example

Create the following two files:


File : example-app.cpp

#include <torch/torch.h>
#include <iostream>

int main() {
    torch::Device device(torch::kCPU);
    if (torch::cuda::is_available()) {
        std::cout << "CUDA is available! Using GPU." << std::endl;
        device = torch::Device(torch::kCUDA);
    }

    torch::Tensor tensor = torch::rand({2, 3}).to(device);
    std::cout << tensor << std::endl;
}



File : CMakeLists.txt

cmake_minimum_required(VERSION 3.0 FATAL_ERROR)
project(example-app)

find_package(Torch REQUIRED)

add_executable(example-app example-app.cpp)
target_link_libraries(example-app "${TORCH_LIBRARIES}")
set_property(TARGET example-app PROPERTY CXX_STANDARD 11)


Load the necessary modules:

module load cmake intel/2018.3 cuda/10 cudnn

Compile the program:

mkdir build
cd build
cmake -DCMAKE_PREFIX_PATH="$LIBTORCH_ROOT;$EBROOTCUDA;$EBROOTCUDNN" ..
make

Run the program:

./example-app

To test an application with CUDA, request an interactive job with a GPU.

Resources

https://pytorch.org/cppdocs/