PyTorch
import torch
from torch import nn
import torch.nn.functional as F

import pytorch_lightning as pl

import torchvision
import torchvision.transforms as transforms
from torchvision.datasets import CIFAR10
from torch.utils.data import DataLoader

from deepspeed.ops.adam import FusedAdam
from pytorch_lightning.strategies import DeepSpeedStrategy

import argparse

parser = argparse.ArgumentParser(description='cifar10 classification models, deepspeed stage 3 test')
parser.add_argument('--lr', default=0.1, help='')
# additional hyperparameters used below (default values here are illustrative)
parser.add_argument('--max_epochs', type=int, default=2, help='')
parser.add_argument('--batch_size', type=int, default=256, help='')
parser.add_argument('--num_workers', type=int, default=0, help='')


def main():
     print("Starting...")

     args = parser.parse_args()

     # Convolutional part of the model
     class ConvPart(nn.Module):

       def __init__(self):
           super(ConvPart, self).__init__()

           self.conv1 = nn.Conv2d(3, 6, 5)
           self.pool = nn.MaxPool2d(2, 2)
           self.conv2 = nn.Conv2d(6, 16, 5)
           self.relu = nn.ReLU()

       def forward(self, x):
           x = self.pool(self.relu(self.conv1(x)))
           x = self.pool(self.relu(self.conv2(x)))
           x = x.view(-1, 16 * 5 * 5)

           return x

     # Dense feedforward part of the model
     class MLPPart(nn.Module):

       def __init__(self):
           super(MLPPart, self).__init__()

           self.fc1 = nn.Linear(16 * 5 * 5, 120)
           self.fc2 = nn.Linear(120, 84)
           self.fc3 = nn.Linear(84, 10)
           self.relu = nn.ReLU()

       def forward(self, x):
           x = self.relu(self.fc1(x))
           x = self.relu(self.fc2(x))
           x = self.fc3(x)

           return x

     class Net(pl.LightningModule):

       def __init__(self):
           super(Net, self).__init__()

           self.conv_part = ConvPart()
           self.mlp_part = MLPPart()

       # Assemble the model in this hook so DeepSpeed stage 3 can shard it across GPUs
       def configure_sharded_model(self):

           self.block = nn.Sequential(self.conv_part, self.mlp_part)

       def forward(self, x):
           x = self.block(x)

           return x

       def training_step(self, batch, batch_idx):
           x, y = batch
           y_hat = self(x)
           loss = F.cross_entropy(y_hat, y)

           return loss

       def configure_optimizers(self):
           return FusedAdam(self.parameters())

     net = Net()

     """ Here we initialize a Trainer() explicitly with 1 node and 2 GPUs.
         To make this script more generic, you can use torch.cuda.device_count() to set the number of GPUs
         and you can use int(os.environ.get("SLURM_JOB_NUM_NODES")) to set the number of nodes.
         We also avoid writing a progress bar to the logs,
         which can cause issues due to updating logs too frequently."""

     trainer = pl.Trainer(accelerator="gpu", devices=2, num_nodes=1, strategy="deepspeed_stage_3", max_epochs=args.max_epochs)

     transform_train = transforms.Compose([transforms.ToTensor(), transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5))])

     dataset_train = CIFAR10(root='./data', train=True, download=False, transform=transform_train)

     train_loader = DataLoader(dataset_train, batch_size=args.batch_size, num_workers=args.num_workers)

     trainer.fit(net, train_loader)


if __name__=='__main__':
   main()

}}
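The Trainer above hard-codes devices=2 and num_nodes=1. As the docstring in the script notes, these values can instead be derived from the environment. The sketch below is a minimal illustration of that idea; the variable names and the max_epochs value are placeholders, and it assumes the script runs inside a Slurm job so that SLURM_JOB_NUM_NODES is defined.

import os
import torch
import pytorch_lightning as pl

n_gpus = torch.cuda.device_count()                        # GPUs visible on this node
n_nodes = int(os.environ.get("SLURM_JOB_NUM_NODES", "1")) # falls back to 1 outside Slurm

trainer = pl.Trainer(accelerator="gpu", devices=n_gpus, num_nodes=n_nodes,
                     strategy="deepspeed_stage_3", max_epochs=2)

This keeps the same Trainer configuration working whether the job requests one GPU or several, without editing the script.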


   |contents=

import torch
from torch import nn
import torch.nn.functional as F

import pytorch_lightning as pl

import torchvision
import torchvision.transforms as transforms
from torchvision.datasets import CIFAR10
from torch.utils.data import DataLoader

from deepspeed.ops.adam import DeepSpeedCPUAdam
from pytorch_lightning.strategies import DeepSpeedStrategy

import argparse

parser = argparse.ArgumentParser(description='cifar10 classification models, deepspeed offload to cpu test')
parser.add_argument('--lr', default=0.1, help='')
# additional hyperparameters used below (default values here are illustrative)
parser.add_argument('--max_epochs', type=int, default=2, help='')
parser.add_argument('--batch_size', type=int, default=256, help='')
parser.add_argument('--num_workers', type=int, default=0, help='')


def main():
     print("Starting...")

     args = parser.parse_args()

     # Convolutional part of the model
     class ConvPart(nn.Module):

       def __init__(self):
           super(ConvPart, self).__init__()

           self.conv1 = nn.Conv2d(3, 6, 5)
           self.pool = nn.MaxPool2d(2, 2)
           self.conv2 = nn.Conv2d(6, 16, 5)
           self.relu = nn.ReLU()

       def forward(self, x):
           x = self.pool(self.relu(self.conv1(x)))
           x = self.pool(self.relu(self.conv2(x)))
           x = x.view(-1, 16 * 5 * 5)

           return x

     # Dense feedforward part of the model
     class MLPPart(nn.Module):

       def __init__(self):
           super(MLPPart, self).__init__()

           self.fc1 = nn.Linear(16 * 5 * 5, 120)
           self.fc2 = nn.Linear(120, 84)
           self.fc3 = nn.Linear(84, 10)
           self.relu = nn.ReLU()

       def forward(self, x):
           x = self.relu(self.fc1(x))
           x = self.relu(self.fc2(x))
           x = self.fc3(x)

           return x

     class Net(pl.LightningModule):

       def __init__(self):
           super(Net, self).__init__()

           self.conv_part = ConvPart()
           self.mlp_part = MLPPart()

       # Assemble the model in this hook so DeepSpeed stage 3 can shard it across GPUs
       def configure_sharded_model(self):

           self.block = nn.Sequential(self.conv_part, self.mlp_part)

       def forward(self, x):
           x = self.block(x)

           return x

       def training_step(self, batch, batch_idx):
           x, y = batch
           y_hat = self(x)
           loss = F.cross_entropy(y_hat, y)

           return loss

       def configure_optimizers(self):
           return DeepSpeedCPUAdam(self.parameters())

     net = Net()

     """ Here we initialize a Trainer() explicitly with 1 node and 2 GPUs.
         To make this script more generic, you can use torch.cuda.device_count() to set the number of GPUs
         and you can use int(os.environ.get("SLURM_JOB_NUM_NODES")) to set the number of nodes.
         We also avoid writing a progress bar to the logs,
         which can cause issues due to updating logs too frequently."""

     trainer = pl.Trainer(accelerator="gpu", devices=2, num_nodes=1, strategy=DeepSpeedStrategy(
         stage=3,
         offload_optimizer=True,
         offload_parameters=True,  # assumed setting: also offload parameters to CPU
         ), max_epochs=args.max_epochs)

     transform_train = transforms.Compose([transforms.ToTensor(), transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5))])

     dataset_train = CIFAR10(root='./data', train=True, download=False, transform=transform_train)

     train_loader = DataLoader(dataset_train, batch_size=args.batch_size, num_workers=args.num_workers)

     trainer.fit(net, train_loader)


if __name__=='__main__':
   main()

}}
import os

import torch
from torch import nn
import torch.nn.functional as F

import pytorch_lightning as pl

import torchvision
import torchvision.transforms as transforms
from torchvision.datasets import CIFAR10
from torch.utils.data import DataLoader

from deepspeed.ops.adam import DeepSpeedCPUAdam
from pytorch_lightning.strategies import DeepSpeedStrategy

import argparse

parser = argparse.ArgumentParser(description='cifar10 classification models, deepspeed offload to nvme test')
parser.add_argument('--lr', default=0.1, help='')
# additional hyperparameters used below (default values here are illustrative)
parser.add_argument('--max_epochs', type=int, default=2, help='')
parser.add_argument('--batch_size', type=int, default=256, help='')
parser.add_argument('--num_workers', type=int, default=0, help='')


def main():
     print("Starting...")

     args = parser.parse_args()

     # Convolutional part of the model
     class ConvPart(nn.Module):

       def __init__(self):
           super(ConvPart, self).__init__()

           self.conv1 = nn.Conv2d(3, 6, 5)
           self.pool = nn.MaxPool2d(2, 2)
           self.conv2 = nn.Conv2d(6, 16, 5)
           self.relu = nn.ReLU()

       def forward(self, x):
           x = self.pool(self.relu(self.conv1(x)))
           x = self.pool(self.relu(self.conv2(x)))
           x = x.view(-1, 16 * 5 * 5)

           return x

     # Dense feedforward part of the model
     class MLPPart(nn.Module):

       def __init__(self):
           super(MLPPart, self).__init__()

           self.fc1 = nn.Linear(16 * 5 * 5, 120)
           self.fc2 = nn.Linear(120, 84)
           self.fc3 = nn.Linear(84, 10)
           self.relu = nn.ReLU()

       def forward(self, x):
           x = self.relu(self.fc1(x))
           x = self.relu(self.fc2(x))
           x = self.fc3(x)

           return x

     class Net(pl.LightningModule):

       def __init__(self):
           super(Net, self).__init__()

           self.conv_part = ConvPart()
           self.mlp_part = MLPPart()

       # Assemble the model in this hook so DeepSpeed stage 3 can shard it across GPUs
       def configure_sharded_model(self):

           self.block = nn.Sequential(self.conv_part, self.mlp_part)

       def forward(self, x):
           x = self.block(x)

           return x

       def training_step(self, batch, batch_idx):
           x, y = batch
           y_hat = self(x)
           loss = F.cross_entropy(y_hat, y)

           return loss

       def configure_optimizers(self):
           return DeepSpeedCPUAdam(self.parameters())

     net = Net()

     """ Here we initialize a Trainer() explicitly with 1 node and 2 GPUs.
         To make this script more generic, you can use torch.cuda.device_count() to set the number of GPUs
         and you can use int(os.environ.get("SLURM_JOB_NUM_NODES")) to set the number of nodes.
         We also avoid writing a progress bar to the logs,
         which can cause issues due to updating logs too frequently."""

     local_scratch = os.environ['SLURM_TMPDIR'] # Get path where local storage is mounted

     print(f'Offloading to: {local_scratch}')

     trainer = pl.Trainer(accelerator="gpu", devices=2, num_nodes=1, strategy=DeepSpeedStrategy(
         stage=3,
         offload_optimizer=True,
         # assumed settings: offload parameters and optimizer state to the local NVMe scratch space
         offload_parameters=True,
         remote_device="nvme",
         offload_params_device="nvme",
         offload_optimizer_device="nvme",
         nvme_path=local_scratch,
         ), max_epochs=args.max_epochs)

     transform_train = transforms.Compose([transforms.ToTensor(), transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5))])

     dataset_train = CIFAR10(root='./data', train=True, download=False, transform=transform_train)

     train_loader = DataLoader(dataset_train, batch_size=args.batch_size, num_workers=args.num_workers)

     trainer.fit(net, train_loader)


if __name__=='__main__':
   main()

}}

