#!/bin/bash
#SBATCH --nodes 1
#SBATCH --gres=gpu:2          # Request 2 GPU "generic resources".
#SBATCH --tasks-per-node=2    # Request 1 process per GPU. You will get 1 CPU per process by default. Request more CPUs with the "cpus-per-task" parameter to enable multiple data-loader workers to load data in parallel.
#SBATCH --mem=32G
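When more CPUs are requested with "cpus-per-task", Slurm exports the per-task allocation as the SLURM_CPUS_PER_TASK environment variable, so the data-loader worker count does not have to be hard-coded in the script. A minimal sketch of that pattern (the file name and the stand-in dataset are illustrative only, not part of the example below):

{{File
   |name=num_workers_example.py
   |lang="python"
   |contents=
import os

import torch
from torch.utils.data import DataLoader, TensorDataset

# SLURM_CPUS_PER_TASK is set by Slurm when --cpus-per-task is requested;
# fall back to a single worker when it is absent.
num_workers = int(os.environ.get("SLURM_CPUS_PER_TASK", 1))

# Stand-in dataset, only to keep the sketch self-contained.
dataset = TensorDataset(torch.randn(64, 3, 32, 32), torch.randint(0, 10, (64,)))
loader = DataLoader(dataset, batch_size=16, num_workers=num_workers)
}}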
   |lang="python"
   |contents=
import os

import torch
from torch import nn
import torch.nn.functional as F

import pytorch_lightning as pl

import torchvision
import torchvision.transforms as transforms
from torchvision.datasets import CIFAR10
from torch.utils.data import DataLoader

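# DeepSpeedCPUAdam implements the Adam update step on the CPU; it is the
# optimizer DeepSpeed expects when optimizer state is offloaded to CPU or NVMe.
# DeepSpeedStrategy lets the Lightning Trainer configure DeepSpeed's ZeRO
# stage and offload settings.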
from deepspeed.ops.adam import DeepSpeedCPUAdam
from pytorch_lightning.strategies import DeepSpeedStrategy

<!--T:509-->
import argparse
import argparse


<!--T:510-->
parser = argparse.ArgumentParser(description='cifar10 classification models, deepspeed offload to cpu test')
parser = argparse.ArgumentParser(description='cifar10 classification models, deepspeed offload to disk test')
parser.add_argument('--lr', default=0.1, help='')
parser.add_argument('--lr', default=0.1, help='')
parser.add_argument('--max_epochs', type=int, default=2, help='')
parser.add_argument('--max_epochs', type=int, default=2, help='')
Line 1,833: Line 1,826:




def main():
    print("Starting...")

    args = parser.parse_args()

    # Convolutional part of the model
    class ConvPart(nn.Module):

        def __init__(self):
            super(ConvPart, self).__init__()

            self.conv1 = nn.Conv2d(3, 6, 5)
            self.pool = nn.MaxPool2d(2, 2)
            self.conv2 = nn.Conv2d(6, 16, 5)
            self.relu = nn.ReLU()

        def forward(self, x):
            x = self.pool(self.relu(self.conv1(x)))
            x = self.pool(self.relu(self.conv2(x)))
            x = x.view(-1, 16 * 5 * 5)

            return x

    # Dense feedforward part of the model
    class MLPPart(nn.Module):

        def __init__(self):
            super(MLPPart, self).__init__()

            self.fc1 = nn.Linear(16 * 5 * 5, 120)
            self.fc2 = nn.Linear(120, 84)
            self.fc3 = nn.Linear(84, 10)
            self.relu = nn.ReLU()

        def forward(self, x):
            x = self.relu(self.fc1(x))
            x = self.relu(self.fc2(x))
            x = self.fc3(x)

            return x

    class Net(pl.LightningModule):

        def __init__(self):
            super(Net, self).__init__()

            self.conv_part = ConvPart()
            self.mlp_part = MLPPart()

        def configure_sharded_model(self):
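            # Building the full model here, rather than in __init__, lets DeepSpeed
            # ZeRO stage 3 shard the weights as they are created, so the complete
            # model never has to be materialized on a single device.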
            self.block = nn.Sequential(self.conv_part, self.mlp_part)

        def forward(self, x):
            x = self.block(x)

            return x

        def training_step(self, batch, batch_idx):
            x, y = batch
            y_hat = self(x)
            loss = F.cross_entropy(y_hat, y)

            return loss

        def configure_optimizers(self):
            return DeepSpeedCPUAdam(self.parameters())


    net = Net()

    """ Here we initialize a Trainer() explicitly with 1 node and 2 GPUs.
        To make this script more generic, you can use torch.cuda.device_count() to set the number of GPUs
        and you can use int(os.environ.get("SLURM_JOB_NUM_NODES")) to set the number of nodes.
        We also disable the progress bar, which can cause issues due to updating logs too frequently."""

    local_scratch = os.environ['SLURM_TMPDIR'] # Get path where local storage is mounted

    print(f'Offloading to: {local_scratch}')

    trainer = pl.Trainer(accelerator="gpu", devices=2, num_nodes=1, strategy=DeepSpeedStrategy(
        stage=3,
        offload_optimizer=True,
        offload_parameters=True,
        remote_device="nvme",
        offload_params_device="nvme",
        offload_optimizer_device="nvme",
        nvme_path=local_scratch,
        ), enable_progress_bar=False, checkpoint_callback=False, max_epochs=args.max_epochs) # Disable PyTorch Lightning checkpointing when offloading to disk

    transform_train = transforms.Compose([transforms.ToTensor(), transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5))])

    dataset_train = CIFAR10(root='./data', train=True, download=False, transform=transform_train) # download=False assumes the CIFAR-10 data are already present under ./data

    train_loader = DataLoader(dataset_train, batch_size=args.batch_size, num_workers=args.num_workers)

    trainer.fit(net, train_loader)


if __name__ == '__main__':
    main()
}}
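The docstring in the script above suggests deriving the device layout from the job environment rather than hard-coding devices=2 and num_nodes=1. A minimal sketch of that pattern, assuming the script runs inside a Slurm job on a GPU node (the file name is illustrative):

{{File
   |name=generic_trainer_setup.py
   |lang="python"
   |contents=
import os

import torch
import pytorch_lightning as pl

# Read the layout from the environment instead of hard-coding it:
# torch.cuda.device_count() gives the GPUs visible to this job on the node,
# and SLURM_JOB_NUM_NODES gives the number of nodes in the allocation.
n_gpus_per_node = torch.cuda.device_count()
n_nodes = int(os.environ.get("SLURM_JOB_NUM_NODES", 1))

trainer = pl.Trainer(accelerator="gpu", devices=n_gpus_per_node, num_nodes=n_nodes)
}}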

