Weights & Biases (wandb): Difference between revisions
No edit summary |
(Marked this version for translation) |
||
Line 1: | Line 1: | ||
<languages /> | <languages /> | ||
<translate> | <translate> | ||
<!--T:1--> | |||
[https://wandb.ai Weights & Biases (wandb)] is a "meta machine learning platform" designed to help AI practitioners and teams build reliable machine learning models for real-world applications by streamlining the machine learning model lifecycle. By using wandb, users can track, compare, explain and reproduce their machine learning experiments. | [https://wandb.ai Weights & Biases (wandb)] is a "meta machine learning platform" designed to help AI practitioners and teams build reliable machine learning models for real-world applications by streamlining the machine learning model lifecycle. By using wandb, users can track, compare, explain and reproduce their machine learning experiments. | ||
== Using wandb on Compute Canada clusters == | == Using wandb on Compute Canada clusters == <!--T:2--> | ||
=== Availability === | === Availability === <!--T:3--> | ||
<!--T:4--> | |||
Since it requires an internet connection, wandb has restricted availability on compute nodes, depending on the cluster: | Since it requires an internet connection, wandb has restricted availability on compute nodes, depending on the cluster: | ||
<!--T:5--> | |||
{| class="wikitable" | {| class="wikitable" | ||
|- | |- | ||
Line 22: | Line 25: | ||
|} | |} | ||
=== Example === | === Example === <!--T:6--> | ||
<!--T:7--> | |||
The following is an example of how to use wandb to track experiments on Béluga. To reproduce this on Cedar, it is not necessary to load the module <tt>httpproxy</tt>. | The following is an example of how to use wandb to track experiments on Béluga. To reproduce this on Cedar, it is not necessary to load the module <tt>httpproxy</tt>. | ||
<!--T:8--> | |||
{{File | {{File | ||
|name=wandb-test.sh | |name=wandb-test.sh | ||
Line 37: | Line 42: | ||
<!--T:9--> | |||
module load python/3.6 httpproxy | module load python/3.6 httpproxy | ||
virtualenv --no-download $SLURM_TMPDIR/env | virtualenv --no-download $SLURM_TMPDIR/env | ||
Line 43: | Line 49: | ||
<!--T:10--> | |||
### Save your wandb API key in your .bash_profile or replace $API_KEY with your actual API key: | ### Save your wandb API key in your .bash_profile or replace $API_KEY with your actual API key: | ||
<!--T:11--> | |||
wandb login $API_KEY | wandb login $API_KEY | ||
<!--T:12--> | |||
python wandb-test.py | python wandb-test.py | ||
}} | }} | ||
<!--T:13--> | |||
The script wandb-test.py uses the <tt>watch()</tt> method to log default metrics to Weights & Biases. See their [https://docs.wandb.ai full documentation] for more options. | The script wandb-test.py uses the <tt>watch()</tt> method to log default metrics to Weights & Biases. See their [https://docs.wandb.ai full documentation] for more options. | ||
<!--T:14--> | |||
{{File | {{File | ||
|name=wandb-test.py | |name=wandb-test.py | ||
Line 62: | Line 73: | ||
import torch.backends.cudnn as cudnn | import torch.backends.cudnn as cudnn | ||
<!--T:15--> | |||
import torchvision | import torchvision | ||
import torchvision.transforms as transforms | import torchvision.transforms as transforms | ||
Line 67: | Line 79: | ||
from torch.utils.data import DataLoader | from torch.utils.data import DataLoader | ||
<!--T:16--> | |||
import argparse | import argparse | ||
<!--T:17--> | |||
import wandb | import wandb | ||
<!--T:18--> | |||
parser = argparse.ArgumentParser(description='cifar10 classification models, wandb test') | parser = argparse.ArgumentParser(description='cifar10 classification models, wandb test') | ||
parser.add_argument('--lr', default=0.1, help='') | parser.add_argument('--lr', default=0.1, help='') | ||
Line 78: | Line 93: | ||
parser.add_argument('--num_workers', type=int, default=0, help='') | parser.add_argument('--num_workers', type=int, default=0, help='') | ||
<!--T:19--> | |||
def main(): | def main(): | ||
args = parser.parse_args() | args = parser.parse_args() | ||
print("Starting Wandb...") | <!--T:20--> | ||
print("Starting Wandb...") | |||
wandb.init(project="wandb-pytorch-test", config=args) | <!--T:21--> | ||
wandb.init(project="wandb-pytorch-test", config=args) | |||
class Net(nn.Module): | <!--T:22--> | ||
class Net(nn.Module): | |||
def __init__(self): | <!--T:23--> | ||
def __init__(self): | |||
super(Net, self).__init__() | super(Net, self).__init__() | ||
self.conv1 = nn.Conv2d(3, 6, 5) | <!--T:24--> | ||
self.conv1 = nn.Conv2d(3, 6, 5) | |||
self.pool = nn.MaxPool2d(2, 2) | self.pool = nn.MaxPool2d(2, 2) | ||
self.conv2 = nn.Conv2d(6, 16, 5) | self.conv2 = nn.Conv2d(6, 16, 5) | ||
Line 98: | Line 119: | ||
self.fc3 = nn.Linear(84, 10) | self.fc3 = nn.Linear(84, 10) | ||
def forward(self, x): | <!--T:25--> | ||
def forward(self, x): | |||
x = self.pool(F.relu(self.conv1(x))) | x = self.pool(F.relu(self.conv1(x))) | ||
x = self.pool(F.relu(self.conv2(x))) | x = self.pool(F.relu(self.conv2(x))) | ||
Line 107: | Line 129: | ||
return x | return x | ||
net = Net() | <!--T:26--> | ||
net = Net() | |||
transform_train = transforms.Compose([transforms.ToTensor(),transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5))]) | <!--T:27--> | ||
transform_train = transforms.Compose([transforms.ToTensor(),transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5))]) | |||
dataset_train = CIFAR10(root='./data', train=True, download=False, transform=transform_train) | <!--T:28--> | ||
dataset_train = CIFAR10(root='./data', train=True, download=False, transform=transform_train) | |||
train_loader = DataLoader(dataset_train, batch_size=args.batch_size, num_workers=args.num_workers) | <!--T:29--> | ||
train_loader = DataLoader(dataset_train, batch_size=args.batch_size, num_workers=args.num_workers) | |||
criterion = nn.CrossEntropyLoss() | <!--T:30--> | ||
criterion = nn.CrossEntropyLoss() | |||
optimizer = optim.SGD(net.parameters(), lr=args.lr) | optimizer = optim.SGD(net.parameters(), lr=args.lr) | ||
wandb.watch(net) | <!--T:31--> | ||
wandb.watch(net) | |||
for epoch in range(args.max_epochs): | <!--T:32--> | ||
for epoch in range(args.max_epochs): | |||
train(epoch, net, criterion, optimizer, train_loader) | <!--T:33--> | ||
train(epoch, net, criterion, optimizer, train_loader) | |||
<!--T:34--> | |||
def train(epoch, net, criterion, optimizer, train_loader): | def train(epoch, net, criterion, optimizer, train_loader): | ||
for batch_idx, (inputs, targets) in enumerate(train_loader): | <!--T:35--> | ||
for batch_idx, (inputs, targets) in enumerate(train_loader): | |||
outputs = net(inputs) | <!--T:36--> | ||
outputs = net(inputs) | |||
loss = criterion(outputs, targets) | loss = criterion(outputs, targets) | ||
optimizer.zero_grad() | <!--T:37--> | ||
optimizer.zero_grad() | |||
loss.backward() | loss.backward() | ||
optimizer.step() | optimizer.step() | ||
<!--T:38--> | |||
if __name__=='__main__': | if __name__=='__main__': | ||
main() | main() | ||
<!--T:39--> | |||
}} | }} | ||
</translate> | </translate> |
Revision as of 18:14, 8 February 2021
Weights & Biases (wandb) is a "meta machine learning platform" designed to help AI practitioners and teams build reliable machine learning models for real-world applications by streamlining the machine learning model lifecycle. By using wandb, users can track, compare, explain and reproduce their machine learning experiments.
Using wandb on Compute Canada clusters
Availability
Since it requires an internet connection, wandb has restricted availability on compute nodes, depending on the cluster:
Cluster | Availability | Note |
---|---|---|
Béluga | Yes ✅ | wandb can be used after loading the httpproxy module: module load httpproxy |
Cedar | Yes ✅ | Internet access is enabled |
Graham | No ❌ | Internet access is disabled on compute nodes |
Example
The following is an example of how to use wandb to track experiments on Béluga. To reproduce this on Cedar, it is not necessary to load the module httpproxy.
#!/bin/bash
#SBATCH --cpus-per-task=1
#SBATCH --mem=2G
#SBATCH --time=0-03:00
#SBATCH --output=%N-%j.out

# Load Python together with httpproxy, then create and activate a fresh
# virtualenv under $SLURM_TMPDIR. The path expansions are quoted so that a
# directory name containing spaces or glob characters is passed through intact.
module load python/3.6 httpproxy
virtualenv --no-download "$SLURM_TMPDIR/env"
source "$SLURM_TMPDIR/env/bin/activate"

# --no-index: install from locally available packages only, never from PyPI.
pip install torchvision wandb --no-index

### Save your wandb API key in your .bash_profile or replace $API_KEY with your actual API key:
wandb login "$API_KEY"

# Run the training script that reports to Weights & Biases.
python wandb-test.py
The script wandb-test.py uses the watch() method to log default metrics to Weights & Biases. See their full documentation for more options.
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import torch.backends.cudnn as cudnn
import torchvision
import torchvision.transforms as transforms
from torchvision.datasets import CIFAR10
from torch.utils.data import DataLoader
import argparse
import wandb
# Command-line options for the CIFAR-10 / wandb demo script.
parser = argparse.ArgumentParser(description='cifar10 classification models, wandb test')
# type=float is required here: without it a value given on the command line is
# kept as a string, and that string is later passed to the optimizer as the
# learning rate. The default (0.1) is unchanged.
parser.add_argument('--lr', type=float, default=0.1, help='')
parser.add_argument('--batch_size', type=int, default=768, help='')
parser.add_argument('--max_epochs', type=int, default=4, help='')
parser.add_argument('--num_workers', type=int, default=0, help='')
def main():
    """Train a small CNN on CIFAR-10 and report the run to Weights & Biases.

    Steps, in order: parse the command-line options, start a wandb run in the
    "wandb-pytorch-test" project with those options as its config, build the
    network, the data pipeline, the loss and the optimizer, register the
    network with ``wandb.watch`` and finally call ``train`` once per epoch.
    """
    args = parser.parse_args()

    print("Starting Wandb...")
    wandb.init(project="wandb-pytorch-test", config=args)

    class Net(nn.Module):
        """Two convolution + pooling stages followed by three linear layers."""

        def __init__(self):
            super().__init__()
            # Layers are created in the same order as they are applied.
            self.conv1 = nn.Conv2d(3, 6, 5)
            self.pool = nn.MaxPool2d(2, 2)
            self.conv2 = nn.Conv2d(6, 16, 5)
            self.fc1 = nn.Linear(16 * 5 * 5, 120)
            self.fc2 = nn.Linear(120, 84)
            self.fc3 = nn.Linear(84, 10)

        def forward(self, x):
            # Convolution -> ReLU -> shared max-pool, twice.
            for conv in (self.conv1, self.conv2):
                x = self.pool(F.relu(conv(x)))
            # Flatten to rows of 16 * 5 * 5 features for the linear layers.
            x = x.view(-1, 16 * 5 * 5)
            for hidden in (self.fc1, self.fc2):
                x = F.relu(hidden(x))
            # The last layer returns its raw output (no activation applied).
            return self.fc3(x)

    net = Net()

    # Input pipeline: tensor conversion, then per-channel normalisation.
    normalize = transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5))
    transform_train = transforms.Compose([transforms.ToTensor(), normalize])

    # download=False: the dataset is expected to be present under ./data already.
    dataset_train = CIFAR10(
        root='./data',
        train=True,
        download=False,
        transform=transform_train,
    )
    train_loader = DataLoader(
        dataset_train,
        batch_size=args.batch_size,
        num_workers=args.num_workers,
    )

    criterion = nn.CrossEntropyLoss()
    optimizer = optim.SGD(net.parameters(), lr=args.lr)

    # Register the model with wandb before training starts.
    wandb.watch(net)

    epoch = 0
    while epoch < args.max_epochs:
        train(epoch, net, criterion, optimizer, train_loader)
        epoch += 1
def train(epoch, net, criterion, optimizer, train_loader):
    """Run one pass over ``train_loader``, updating ``net`` and logging the loss.

    Args:
        epoch: Index of the current epoch; recorded alongside each logged loss.
        net: Model being trained; called on every batch of inputs.
        criterion: Loss callable invoked as ``criterion(outputs, targets)``.
        optimizer: Optimizer whose ``zero_grad()`` and ``step()`` are called
            once per batch.
        train_loader: Iterable yielding ``(inputs, targets)`` pairs.

    Returns:
        None. ``net`` and ``optimizer`` are updated in place.
    """
    for inputs, targets in train_loader:
        outputs = net(inputs)
        loss = criterion(outputs, targets)

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        # wandb.watch() only registers the model; what it collects is sent
        # together with wandb.log() calls. Logging the loss once per step both
        # records a training curve and lets the watched data reach the run.
        wandb.log({"epoch": epoch, "loss": loss.item()})
# Start training only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()