<languages />
<translate>
[https://wandb.ai Weights & Biases (wandb)] is a "meta machine learning platform" designed to help AI practitioners and teams build reliable machine learning models for real-world applications by streamlining the machine learning model lifecycle. By using wandb, users can track, compare, explain and reproduce their machine learning experiments.
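
In practice, the core tracking pattern is a call to <tt>wandb.init()</tt> to start a run, followed by <tt>wandb.log()</tt> calls to record metrics. A minimal sketch (the project name and values here are placeholders, not part of the example further down):

<pre>
import wandb

# Start a tracked run; config holds the hyperparameters you want to
# compare across runs ("my-project" is a placeholder name).
run = wandb.init(project="my-project", config={"lr": 0.1, "batch_size": 32})

# Log metrics as the experiment progresses; each call records one step.
for step in range(3):
    wandb.log({"loss": 1.0 / (step + 1)})

run.finish()
</pre>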

== Using wandb on Compute Canada clusters ==

=== Availability ===

Since it requires an internet connection, wandb has restricted availability on compute nodes, depending on the cluster:

{| class="wikitable"
|-
! Cluster !! Availability !! Note
|-
| Béluga || Yes || requires loading the <tt>httpproxy</tt> module
|-
| Cedar || Yes || no proxy module is needed
|}
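
If you are working on a compute node with no internet access at all, a possible workaround (a general wandb feature, not specific to these clusters) is to run in offline mode and upload the results afterwards from a node with internet access:

<pre>
# Run in offline mode: wandb writes metrics to a local ./wandb directory
# instead of uploading them.
export WANDB_MODE=offline
python wandb-test.py

# Later, from a node with internet access (e.g. a login node), upload the run:
wandb sync wandb/offline-run-*
</pre>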

=== Example ===

The following is an example of how to use wandb to track experiments on Béluga. To reproduce this on Cedar, you do not need to load the <tt>httpproxy</tt> module.

{{File
|name=wandb-test.sh
|lang="bash"
|contents=
#!/bin/bash

module load python/3.6 httpproxy
virtualenv --no-download $SLURM_TMPDIR/env

# Activate the virtualenv and install the required packages from the
# cluster-provided wheels.
source $SLURM_TMPDIR/env/bin/activate
pip install --no-index torch torchvision wandb

### Save your wandb API key in your .bash_profile or replace $API_KEY with your actual API key:

wandb login $API_KEY

python wandb-test.py
}}

The script <tt>wandb-test.py</tt> uses the <tt>watch()</tt> method to log default metrics to Weights & Biases. See their [https://docs.wandb.ai full documentation] for more options.
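
For instance, <tt>watch()</tt> accepts optional arguments controlling what is logged and how often. A short sketch (the model here is an arbitrary stand-in, not the network from the example below):

<pre>
import torch.nn as nn
import wandb

wandb.init(project="wandb-pytorch-test")
net = nn.Linear(10, 2)  # stand-in for any torch.nn.Module

# log="all" records both gradients and parameter histograms;
# log_freq sets how often (in batches) they are sampled.
wandb.watch(net, log="all", log_freq=100)
</pre>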

{{File
|name=wandb-test.py
|lang="python"
|contents=
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import torch.backends.cudnn as cudnn

import torchvision
import torchvision.transforms as transforms
from torchvision.datasets import CIFAR10
from torch.utils.data import DataLoader

import argparse

import wandb

parser = argparse.ArgumentParser(description='cifar10 classification models, wandb test')
parser.add_argument('--lr', type=float, default=0.1, help='')
parser.add_argument('--max_epochs', type=int, default=4, help='')
parser.add_argument('--batch_size', type=int, default=768, help='')
parser.add_argument('--num_workers', type=int, default=0, help='')


def main():
    args = parser.parse_args()

    print("Starting Wandb...")

    # Start a run and record the hyperparameters held in args.
    wandb.init(project="wandb-pytorch-test", config=args)

    # A small convolutional network for 32x32 CIFAR-10 images.
    class Net(nn.Module):

        def __init__(self):
            super(Net, self).__init__()

            self.conv1 = nn.Conv2d(3, 6, 5)
            self.pool = nn.MaxPool2d(2, 2)
            self.conv2 = nn.Conv2d(6, 16, 5)
            self.fc1 = nn.Linear(16 * 5 * 5, 120)
            self.fc2 = nn.Linear(120, 84)
            self.fc3 = nn.Linear(84, 10)

        def forward(self, x):
            x = self.pool(F.relu(self.conv1(x)))
            x = self.pool(F.relu(self.conv2(x)))
            x = x.view(-1, 16 * 5 * 5)
            x = F.relu(self.fc1(x))
            x = F.relu(self.fc2(x))
            x = self.fc3(x)
            return x

    net = Net()

    transform_train = transforms.Compose([transforms.ToTensor(), transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5))])

    # The dataset must be downloaded beforehand: compute nodes may not
    # have the internet access needed to fetch it.
    dataset_train = CIFAR10(root='./data', train=True, download=False, transform=transform_train)

    train_loader = DataLoader(dataset_train, batch_size=args.batch_size, num_workers=args.num_workers)

    criterion = nn.CrossEntropyLoss()
    optimizer = optim.SGD(net.parameters(), lr=args.lr)

    # Hook wandb into the model to log its default metrics.
    wandb.watch(net)

    for epoch in range(args.max_epochs):
        train(epoch, net, criterion, optimizer, train_loader)


def train(epoch, net, criterion, optimizer, train_loader):
    for batch_idx, (inputs, targets) in enumerate(train_loader):
        outputs = net(inputs)
        loss = criterion(outputs, targets)

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()


if __name__ == '__main__':
    main()
}}
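
Once both files are in the same directory, submit the job from a login node as usual:

<pre>
sbatch wandb-test.sh
</pre>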
</translate>