import torch.nn.functional as F

import pytorch_lightning as pl

import torchvision
import torchvision.transforms as transforms
from torchvision.datasets import CIFAR10

from torch.utils.data import DataLoader

from deepspeed.ops.adam import FusedAdam
from pytorch_lightning.strategies import DeepSpeedStrategy

import argparse

parser = argparse.ArgumentParser(description='cifar10 classification models, deepspeed stage 3 test')
parser.add_argument('--lr', default=0.1, help='')
parser.add_argument('--max_epochs', type=int, default=2, help='')
parser.add_argument('--num_workers', type=int, default=0, help='')
parser.add_argument('--batch_size', type=int, default=768, help='')


def main():
    print("Starting...")
    args = parser.parse_args()
    # Convolutional part of the model
    class ConvPart(nn.Module):

        def __init__(self):
            super(ConvPart, self).__init__()

            self.conv1 = nn.Conv2d(3, 6, 5)
            self.pool = nn.MaxPool2d(2, 2)
            self.conv2 = nn.Conv2d(6, 16, 5)
            self.relu = nn.ReLU()

        def forward(self, x):
            x = self.pool(self.relu(self.conv1(x)))
            x = self.pool(self.relu(self.conv2(x)))
            x = x.view(-1, 16 * 5 * 5)

            return x

    # Dense feedforward part of the model
    class MLPPart(nn.Module):

        def __init__(self):
            super(MLPPart, self).__init__()

            self.fc1 = nn.Linear(16 * 5 * 5, 120)
            self.fc2 = nn.Linear(120, 84)
            self.fc3 = nn.Linear(84, 10)
            self.relu = nn.ReLU()

        def forward(self, x):
            x = self.relu(self.fc1(x))
            x = self.relu(self.fc2(x))
            x = self.fc3(x)

            return x

    class Net(pl.LightningModule):

        def __init__(self):
            super(Net, self).__init__()

            self.conv_part = ConvPart()
            self.mlp_part = MLPPart()

        # Combining the two parts here lets DeepSpeed shard the model as it is built
        def configure_sharded_model(self):
            self.block = nn.Sequential(self.conv_part, self.mlp_part)

        def forward(self, x):
            x = self.block(x)
            return x

        def training_step(self, batch, batch_idx):
            x, y = batch
            y_hat = self(x)
            loss = F.cross_entropy(y_hat, y)
            return loss
        def configure_optimizers(self):
            # FusedAdam is DeepSpeed's GPU-fused implementation of Adam
            return FusedAdam(self.parameters())

    net = Net()

    """ Here we initialize a Trainer() explicitly with 1 node and 2 GPUs.
    To make this script more generic, you can use torch.cuda.device_count() to set the number of GPUs
    and you can use int(os.environ.get("SLURM_JOB_NUM_NODES")) to set the number of nodes.
    We also avoid writing a progress bar to the logs,
    which can cause issues due to updating logs too frequently."""
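    # A sketch of the generic setup described above (the names n_gpus and
    # n_nodes are illustrative assumptions, not part of the original example):
    # import os
    # n_gpus = torch.cuda.device_count()
    # n_nodes = int(os.environ.get("SLURM_JOB_NUM_NODES", "1"))
    # trainer = pl.Trainer(accelerator="gpu", devices=n_gpus, num_nodes=n_nodes,
    #                      strategy="deepspeed_stage_3", max_epochs=args.max_epochs)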
    trainer = pl.Trainer(accelerator="gpu", devices=2, num_nodes=1, strategy="deepspeed_stage_3", max_epochs=args.max_epochs)

    transform_train = transforms.Compose([transforms.ToTensor(), transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5))])

    dataset_train = CIFAR10(root='./data', train=True, download=False, transform=transform_train)

    train_loader = DataLoader(dataset_train, batch_size=args.batch_size, num_workers=args.num_workers)

    trainer.fit(net, train_loader)

if __name__ == '__main__':
    main()
}}
|contents= | |contents= | ||
import torch
from torch import nn
import torch.nn.functional as F

import pytorch_lightning as pl

import torchvision
import torchvision.transforms as transforms
from torchvision.datasets import CIFAR10

from torch.utils.data import DataLoader

from deepspeed.ops.adam import DeepSpeedCPUAdam
from pytorch_lightning.strategies import DeepSpeedStrategy

import argparse

parser = argparse.ArgumentParser(description='cifar10 classification models, deepspeed offload to cpu test')
parser.add_argument('--lr', default=0.1, help='')
parser.add_argument('--max_epochs', type=int, default=2, help='')
parser.add_argument('--num_workers', type=int, default=0, help='')
parser.add_argument('--batch_size', type=int, default=768, help='')


def main():
    print("Starting...")
    args = parser.parse_args()
    # Convolutional part of the model
    class ConvPart(nn.Module):

        def __init__(self):
            super(ConvPart, self).__init__()

            self.conv1 = nn.Conv2d(3, 6, 5)
            self.pool = nn.MaxPool2d(2, 2)
            self.conv2 = nn.Conv2d(6, 16, 5)
            self.relu = nn.ReLU()

        def forward(self, x):
            x = self.pool(self.relu(self.conv1(x)))
            x = self.pool(self.relu(self.conv2(x)))
            x = x.view(-1, 16 * 5 * 5)

            return x

    # Dense feedforward part of the model
    class MLPPart(nn.Module):

        def __init__(self):
            super(MLPPart, self).__init__()

            self.fc1 = nn.Linear(16 * 5 * 5, 120)
            self.fc2 = nn.Linear(120, 84)
            self.fc3 = nn.Linear(84, 10)
            self.relu = nn.ReLU()

        def forward(self, x):
            x = self.relu(self.fc1(x))
            x = self.relu(self.fc2(x))
            x = self.fc3(x)

            return x

    class Net(pl.LightningModule):

        def __init__(self):
            super(Net, self).__init__()

            self.conv_part = ConvPart()
            self.mlp_part = MLPPart()

        # Combining the two parts here lets DeepSpeed shard the model as it is built
        def configure_sharded_model(self):
            self.block = nn.Sequential(self.conv_part, self.mlp_part)

        def forward(self, x):
            x = self.block(x)
            return x

        def training_step(self, batch, batch_idx):
            x, y = batch
            y_hat = self(x)
            loss = F.cross_entropy(y_hat, y)
            return loss
        def configure_optimizers(self):
            # DeepSpeedCPUAdam runs the optimizer step on the CPU, which pairs
            # with offloading the optimizer state to CPU memory
            return DeepSpeedCPUAdam(self.parameters())

    net = Net()

    """ Here we initialize a Trainer() explicitly with 1 node and 2 GPUs.
    To make this script more generic, you can use torch.cuda.device_count() to set the number of GPUs
    and you can use int(os.environ.get("SLURM_JOB_NUM_NODES")) to set the number of nodes.
    We also avoid writing a progress bar to the logs,
    which can cause issues due to updating logs too frequently."""
    trainer = pl.Trainer(accelerator="gpu", devices=2, num_nodes=1, strategy=DeepSpeedStrategy(
        stage=3,
        offload_optimizer=True,   # keep optimizer state in CPU memory
        offload_parameters=True,  # keep model parameters in CPU memory
        ), max_epochs=args.max_epochs)
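    # The same setup expressed as a raw DeepSpeed config dict (a sketch using
    # the config= parameter of DeepSpeedStrategy; not part of the original example):
    # ds_config = {
    #     "zero_optimization": {
    #         "stage": 3,
    #         "offload_optimizer": {"device": "cpu"},
    #         "offload_param": {"device": "cpu"},
    #     },
    # }
    # trainer = pl.Trainer(accelerator="gpu", devices=2, num_nodes=1,
    #                      strategy=DeepSpeedStrategy(config=ds_config), max_epochs=args.max_epochs)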
    transform_train = transforms.Compose([transforms.ToTensor(), transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5))])

    dataset_train = CIFAR10(root='./data', train=True, download=False, transform=transform_train)

    train_loader = DataLoader(dataset_train, batch_size=args.batch_size, num_workers=args.num_workers)

    trainer.fit(net, train_loader)

if __name__ == '__main__':
    main()
}}
import os

import torch
from torch import nn
import torch.nn.functional as F

import pytorch_lightning as pl

import torchvision
import torchvision.transforms as transforms
from torchvision.datasets import CIFAR10

from torch.utils.data import DataLoader

from deepspeed.ops.adam import DeepSpeedCPUAdam
from pytorch_lightning.strategies import DeepSpeedStrategy

import argparse

parser = argparse.ArgumentParser(description='cifar10 classification models, deepspeed offload to nvme test')
parser.add_argument('--lr', default=0.1, help='')
parser.add_argument('--max_epochs', type=int, default=2, help='')
parser.add_argument('--num_workers', type=int, default=0, help='')
parser.add_argument('--batch_size', type=int, default=768, help='')


def main():
    print("Starting...")
    args = parser.parse_args()
    # Convolutional part of the model
    class ConvPart(nn.Module):

        def __init__(self):
            super(ConvPart, self).__init__()

            self.conv1 = nn.Conv2d(3, 6, 5)
            self.pool = nn.MaxPool2d(2, 2)
            self.conv2 = nn.Conv2d(6, 16, 5)
            self.relu = nn.ReLU()

        def forward(self, x):
            x = self.pool(self.relu(self.conv1(x)))
            x = self.pool(self.relu(self.conv2(x)))
            x = x.view(-1, 16 * 5 * 5)

            return x

    # Dense feedforward part of the model
    class MLPPart(nn.Module):

        def __init__(self):
            super(MLPPart, self).__init__()

            self.fc1 = nn.Linear(16 * 5 * 5, 120)
            self.fc2 = nn.Linear(120, 84)
            self.fc3 = nn.Linear(84, 10)
            self.relu = nn.ReLU()

        def forward(self, x):
            x = self.relu(self.fc1(x))
            x = self.relu(self.fc2(x))
            x = self.fc3(x)

            return x

    class Net(pl.LightningModule):

        def __init__(self):
            super(Net, self).__init__()

            self.conv_part = ConvPart()
            self.mlp_part = MLPPart()

        # Combining the two parts here lets DeepSpeed shard the model as it is built
        def configure_sharded_model(self):
            self.block = nn.Sequential(self.conv_part, self.mlp_part)

        def forward(self, x):
            x = self.block(x)
            return x

        def training_step(self, batch, batch_idx):
            x, y = batch
            y_hat = self(x)
            loss = F.cross_entropy(y_hat, y)
            return loss
        def configure_optimizers(self):
            # DeepSpeedCPUAdam runs the optimizer step on the CPU, which pairs
            # with offloading the optimizer state off the GPU
            return DeepSpeedCPUAdam(self.parameters())

    net = Net()

    """ Here we initialize a Trainer() explicitly with 1 node and 2 GPUs.
    To make this script more generic, you can use torch.cuda.device_count() to set the number of GPUs
    and you can use int(os.environ.get("SLURM_JOB_NUM_NODES")) to set the number of nodes.
    We also avoid writing a progress bar to the logs,
    which can cause issues due to updating logs too frequently."""
    local_scratch = os.environ['SLURM_TMPDIR']  # Get path where node-local storage is mounted

    print(f'Offloading to: {local_scratch}')

    trainer = pl.Trainer(accelerator="gpu", devices=2, num_nodes=1, strategy=DeepSpeedStrategy(
        stage=3,
        offload_optimizer=True,
        offload_parameters=True,
        remote_device="nvme",
        offload_params_device="nvme",
        offload_optimizer_device="nvme",
        nvme_path=local_scratch,
        ), max_epochs=args.max_epochs)
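    # Optional NVMe tuning (a sketch: the parameter names below exist on
    # DeepSpeedStrategy, but the values are illustrative assumptions, not part
    # of the original example). Larger or more buffers can improve offloading
    # throughput at the cost of host memory:
    # strategy = DeepSpeedStrategy(stage=3, offload_optimizer=True, offload_parameters=True,
    #                              remote_device="nvme", offload_params_device="nvme",
    #                              offload_optimizer_device="nvme", nvme_path=local_scratch,
    #                              params_buffer_count=5, params_buffer_size=100_000_000)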
    transform_train = transforms.Compose([transforms.ToTensor(), transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5))])

    dataset_train = CIFAR10(root='./data', train=True, download=False, transform=transform_train)

    train_loader = DataLoader(dataset_train, batch_size=args.batch_size, num_workers=args.num_workers)

    trainer.fit(net, train_loader)

if __name__ == '__main__':
    main()
}}