Weights & Biases

Use Weights & Biases from within Saturn Cloud
Weights & Biases
Try this example in seconds on Saturn Cloud

Overview

This example shows how to use Weights & Biases to monitor the progress of model training on a single-GPU resource in Saturn Cloud. It uses PyTorch for image classification: starting from a pre-trained version of ResNet50, it applies transfer learning on the Stanford Dogs dataset to make the model perform better at dog image identification. The separate Weights & Biases (With Dask) example uses the same task to show how to apply Weights & Biases when running on a Dask cluster.

Example code

Imports

import math
import torch
import wandb
import re
import s3fs
from torch import nn, optim
from torchvision import transforms, models
from torch.utils.data.sampler import RandomSampler
from dask_pytorch_ddp import data
import multiprocessing as mp
from fastprogress.fastprogress import master_bar, progress_bar

Set up Weights & Biases

Import the Weights & Biases library, and confirm that you are logged in.

The Start Script in this example uses your Weights & Biases token to log in. The resource will try to read it from an environment variable named WANDB_LOGIN, which you can set up in the Credentials section of Saturn Cloud. If your token is missing, the wandb.login() command will prompt you to input it. While this will work in the single GPU case, inputting your token via the UI will not work when using Weights & Biases with Dask. Once you add the token to the Credentials page of Saturn Cloud, you’ll need to restart the resource.

# Log in to Weights & Biases up front, before any run is initialised.
wandb.login()

Label Formatting

These utilities ensure the training data labels correspond to the pretrained model’s label expectations.

# Read the ImageNet index-to-label listing anonymously from the public bucket,
# then strip surrounding whitespace from every line.
s3 = s3fs.S3FileSystem(anon=True)
with s3.open("s3://saturn-public-data/dogs/imagenet1000_clsidx_to_labels.txt") as f:
    raw_label_lines = f.readlines()
imagenetclasses = [raw_line.strip() for raw_line in raw_label_lines]


# Format labels to match pretrained Resnet
def replace_label(dataset_label, model_labels):
    """Find the pretrained-model label line that names a dataset class.

    The breed name is taken from ``dataset_label`` as the text following the
    ``n<digits>-`` prefix (up to the next ``/``). Underscores are treated as
    spaces on both sides, and the comparison is a literal, case-sensitive
    substring test against the quoted names on each model label line.

    Args:
        dataset_label: Dataset class name such as ``"n02085620-Chihuahua"``.
        model_labels: Iterable of label lines. Each line is passed through
            ``str()``; the patterns below expect the ``b'...'`` form that
            ``str()`` produces for ``bytes`` lines like ``b"151: 'Chihuahua',"``.

    Returns:
        ``(line, index)`` for the first matching line, where ``line`` is the
        stringified line with braces removed and ``index`` is the class index
        as a string of digits; ``None`` when no line matches.

    Raises:
        AttributeError: If ``dataset_label`` has no ``n<digits>-<name>`` part.
    """
    label_string = re.search("n[0-9]+-([^/]+)", dataset_label).group(1)
    wanted = label_string.replace("_", " ")

    for i in model_labels:
        # The first/last lines of the listing carry the dict braces; drop them.
        i = str(i).replace("{", "").replace("}", "")
        model_label_str = re.search("""b["'][0-9]+: ["']([^\\/]+)["'],["']""", i)
        model_label_idx = re.search("""b["']([0-9]+):""", i)

        # Lines that do not look like `<index>: '<names>',` cannot match.
        if model_label_str is None or model_label_idx is None:
            continue

        # Compare against the captured names themselves. The repr of a Match
        # object is truncated, so searching it can miss names late in a line.
        if wanted in model_label_str.group(1).replace("_", " "):
            return i, model_label_idx.group(1)

    return None

Set Model Specifications

Here you can assign your model hyperparameters, as well as identifying where the training data is housed on S3. All these parameters, as well as some extra elements like Notes and Tags, are tracked by Weights & Biases for you.

# Hyperparameters and data location; unpacked as keyword arguments into
# simple_train_single.
model_params = dict(
    n_epochs=6,
    batch_size=64,
    base_lr=0.0004,
    downsample_to=0.5,  # fraction of the training set sampled per epoch
    bucket="saturn-public-data",
    prefix="dogs/Images",
    pretrained_classes=imagenetclasses,
)
# Run configuration handed to wandb.init: the hyperparameters plus descriptive fields.
wbargs = dict(
    model_params,
    classes=120,
    Notes="baseline",
    Tags=["single", "gpu"],
    dataset="StanfordDogs",
    architecture="ResNet",
)

Training Function

This function encompasses the training task.

  • Load model
  • Initialize Weights & Biases run
  • Set up DataLoader to iterate over training data
  • Perform training tasks
  • Write model performance data to Weights & Biases
def simple_train_single(
    bucket, prefix, batch_size, downsample_to, n_epochs, base_lr, pretrained_classes
):
    """Fine-tune a pretrained ResNet50 on S3-hosted images and log the run to W&B.

    The function starts a Weights & Biases run (configured from the
    module-level ``wbargs`` dict), trains on the ``"cuda"`` device, logs
    per-batch metrics and periodic example images, and finally uploads a table
    of predictions gathered during the last epoch as an artifact.

    Args:
        bucket: S3 bucket passed to ``data.S3ImageFolder``.
        prefix: Key prefix inside ``bucket`` passed to ``data.S3ImageFolder``.
        batch_size: Batch size of the training ``DataLoader``.
        downsample_to: Fraction of the dataset size drawn (with replacement)
            by the sampler in each epoch.
        n_epochs: Number of training epochs.
        base_lr: Learning rate for AdamW; also the ``max_lr`` of OneCycleLR.
        pretrained_classes: Label lines of the pretrained model, forwarded to
            ``replace_label`` to remap the dataset's class indices.

    Returns:
        None. Results are reported through Weights & Biases.
    """

    # --------- Format params --------- #
    device = torch.device("cuda")
    net = models.resnet50(pretrained=True)  # True means we start with the imagenet version
    model = net.to(device)

    # --------- Start wandb --------- #
    wandb.init(config=wbargs, project="wandb_saturncloud_demo")
    wandb.watch(model)

    # --------- Set up eval --------- #
    criterion = nn.CrossEntropyLoss().to(device)
    optimizer = optim.AdamW(model.parameters(), lr=base_lr, eps=1e-06)

    # --------- Retrieve data for training --------- #
    transform = transforms.Compose(
        [transforms.Resize(256), transforms.CenterCrop(250), transforms.ToTensor()]
    )

    # Because we want to load our images directly and lazily from S3,
    # we use a custom Dataset class called S3ImageFolder.
    whole_dataset = data.S3ImageFolder(bucket, prefix, transform=transform, anon=True)

    # Format target labels: point every dataset class at the index the
    # pretrained model already uses for it.
    new_class_to_idx = {
        x: int(replace_label(x, pretrained_classes)[1]) for x in whole_dataset.classes
    }
    whole_dataset.class_to_idx = new_class_to_idx

    # ------ Create dataloader ------- #
    train_loader = torch.utils.data.DataLoader(
        whole_dataset,
        sampler=RandomSampler(
            whole_dataset,
            replacement=True,
            num_samples=math.floor(len(whole_dataset) * downsample_to),
        ),
        batch_size=batch_size,
        num_workers=4,
        multiprocessing_context=mp.get_context("fork"),
    )

    # Using the OneCycleLR learning rate schedule
    scheduler = optim.lr_scheduler.OneCycleLR(
        optimizer, max_lr=base_lr, steps_per_epoch=len(train_loader), epochs=n_epochs
    )

    # ------ Prepare wandb Table for predictions ------- #
    columns = ["image", "label", "prediction", "score"]
    preds_table = wandb.Table(columns=columns)

    # --------- Start Training ------- #
    final_epoch = n_epochs - 1
    mb = master_bar(range(n_epochs))
    for epoch in mb:
        count = 0
        model.train()

        for inputs, labels in progress_bar(train_loader, parent=mb):
            # zero the parameter gradients
            optimizer.zero_grad()

            inputs, labels = inputs.to(device), labels.to(device)

            # Run model iteration
            outputs = model(inputs)

            # torch.max over dim 1 yields (max values, argmax indices); only
            # the indices (predicted classes) are needed here.
            _, preds = torch.max(outputs, 1)

            loss = criterion(outputs, labels)
            correct = (preds == labels).sum().item()

            loss.backward()
            optimizer.step()
            scheduler.step()

            # Log your metrics to wandb
            logs = {
                "train/train_loss": loss.item(),
                "train/learning_rate": scheduler.get_last_lr()[0],
                "train/correct": correct,
                "train/epoch": epoch + count / len(train_loader),
                "train/count": count,
            }

            # Occasionally log some images to ensure the image data looks correct
            if count % 25 == 0:
                logs["examples/example_images"] = wandb.Image(inputs[:5], caption=f"Step: {count}")

            # Log some predictions to wandb during final epoch for analysis
            if epoch == final_epoch and count % 4 == 0:
                # Softmax score of each predicted class. Computed only here
                # because every .item() call forces a device-to-host sync and
                # the scores are not used on any other batch.
                perct = [
                    torch.nn.functional.softmax(el, dim=0)[i].item()
                    for i, el in zip(preds, outputs)
                ]
                for image, label, pred, score in zip(inputs, labels, preds, perct):
                    preds_table.add_data(wandb.Image(image), label, pred, score)

            # Log metrics to wandb
            wandb.log(logs)

            count += 1

    # Upload your predictions table for analysis
    predictions_artifact = wandb.Artifact(
        "train_predictions_" + str(wandb.run.id), type="train_predictions"
    )
    predictions_artifact.add(preds_table, "train_predictions")
    wandb.run.log_artifact(predictions_artifact)

    # Close your wandb run
    wandb.run.finish()

Run Model

You can now monitor the model run on Weights & Biases in real time.

# Start training, unpacking the model_params dict as keyword arguments.
simple_train_single(**model_params)

At this point, you can view the Weights & Biases dashboard to see the performance of the model and system resources utilization in real time.

Conclusion

This example showed how straightforward it is to use Weights & Biases while running Saturn Cloud. By only adding your API key to the credentials section of Saturn Cloud you can quickly use Weights & Biases in your resources.

import math
import torch
import wandb
import re
import s3fs
from torch import nn, optim
from torchvision import transforms, models
from torch.utils.data.sampler import RandomSampler
from dask_pytorch_ddp import data
import multiprocessing as mp
from fastprogress.fastprogress import master_bar, progress_bar


# Log in to Weights & Biases up front, before any run is initialised.
wandb.login()


# Read the ImageNet index-to-label listing anonymously from the public bucket,
# then strip surrounding whitespace from every line.
s3 = s3fs.S3FileSystem(anon=True)
with s3.open("s3://saturn-public-data/dogs/imagenet1000_clsidx_to_labels.txt") as f:
    raw_label_lines = f.readlines()
imagenetclasses = [raw_line.strip() for raw_line in raw_label_lines]


# Format labels to match pretrained Resnet
def replace_label(dataset_label, model_labels):
    """Find the pretrained-model label line that names a dataset class.

    The breed name is taken from ``dataset_label`` as the text following the
    ``n<digits>-`` prefix (up to the next ``/``). Underscores are treated as
    spaces on both sides, and the comparison is a literal, case-sensitive
    substring test against the quoted names on each model label line.

    Args:
        dataset_label: Dataset class name such as ``"n02085620-Chihuahua"``.
        model_labels: Iterable of label lines. Each line is passed through
            ``str()``; the patterns below expect the ``b'...'`` form that
            ``str()`` produces for ``bytes`` lines like ``b"151: 'Chihuahua',"``.

    Returns:
        ``(line, index)`` for the first matching line, where ``line`` is the
        stringified line with braces removed and ``index`` is the class index
        as a string of digits; ``None`` when no line matches.

    Raises:
        AttributeError: If ``dataset_label`` has no ``n<digits>-<name>`` part.
    """
    label_string = re.search("n[0-9]+-([^/]+)", dataset_label).group(1)
    wanted = label_string.replace("_", " ")

    for i in model_labels:
        # The first/last lines of the listing carry the dict braces; drop them.
        i = str(i).replace("{", "").replace("}", "")
        model_label_str = re.search("""b["'][0-9]+: ["']([^\\/]+)["'],["']""", i)
        model_label_idx = re.search("""b["']([0-9]+):""", i)

        # Lines that do not look like `<index>: '<names>',` cannot match.
        if model_label_str is None or model_label_idx is None:
            continue

        # Compare against the captured names themselves. The repr of a Match
        # object is truncated, so searching it can miss names late in a line.
        if wanted in model_label_str.group(1).replace("_", " "):
            return i, model_label_idx.group(1)

    return None


# Hyperparameters and data location; unpacked as keyword arguments into
# simple_train_single.
model_params = dict(
    n_epochs=6,
    batch_size=64,
    base_lr=0.0004,
    downsample_to=0.5,  # fraction of the training set sampled per epoch
    bucket="saturn-public-data",
    prefix="dogs/Images",
    pretrained_classes=imagenetclasses,
)


# Run configuration handed to wandb.init: the hyperparameters plus descriptive fields.
wbargs = dict(
    model_params,
    classes=120,
    Notes="baseline",
    Tags=["single", "gpu"],
    dataset="StanfordDogs",
    architecture="ResNet",
)


def simple_train_single(
    bucket, prefix, batch_size, downsample_to, n_epochs, base_lr, pretrained_classes
):
    """Fine-tune a pretrained ResNet50 on S3-hosted images and log the run to W&B.

    The function starts a Weights & Biases run (configured from the
    module-level ``wbargs`` dict), trains on the ``"cuda"`` device, logs
    per-batch metrics and periodic example images, and finally uploads a table
    of predictions gathered during the last epoch as an artifact.

    Args:
        bucket: S3 bucket passed to ``data.S3ImageFolder``.
        prefix: Key prefix inside ``bucket`` passed to ``data.S3ImageFolder``.
        batch_size: Batch size of the training ``DataLoader``.
        downsample_to: Fraction of the dataset size drawn (with replacement)
            by the sampler in each epoch.
        n_epochs: Number of training epochs.
        base_lr: Learning rate for AdamW; also the ``max_lr`` of OneCycleLR.
        pretrained_classes: Label lines of the pretrained model, forwarded to
            ``replace_label`` to remap the dataset's class indices.

    Returns:
        None. Results are reported through Weights & Biases.
    """

    # --------- Format params --------- #
    device = torch.device("cuda")
    net = models.resnet50(pretrained=True)  # True means we start with the imagenet version
    model = net.to(device)

    # --------- Start wandb --------- #
    wandb.init(config=wbargs, project="wandb_saturncloud_demo")
    wandb.watch(model)

    # --------- Set up eval --------- #
    criterion = nn.CrossEntropyLoss().to(device)
    optimizer = optim.AdamW(model.parameters(), lr=base_lr, eps=1e-06)

    # --------- Retrieve data for training --------- #
    transform = transforms.Compose(
        [transforms.Resize(256), transforms.CenterCrop(250), transforms.ToTensor()]
    )

    # Because we want to load our images directly and lazily from S3,
    # we use a custom Dataset class called S3ImageFolder.
    whole_dataset = data.S3ImageFolder(bucket, prefix, transform=transform, anon=True)

    # Format target labels: point every dataset class at the index the
    # pretrained model already uses for it.
    new_class_to_idx = {
        x: int(replace_label(x, pretrained_classes)[1]) for x in whole_dataset.classes
    }
    whole_dataset.class_to_idx = new_class_to_idx

    # ------ Create dataloader ------- #
    train_loader = torch.utils.data.DataLoader(
        whole_dataset,
        sampler=RandomSampler(
            whole_dataset,
            replacement=True,
            num_samples=math.floor(len(whole_dataset) * downsample_to),
        ),
        batch_size=batch_size,
        num_workers=4,
        multiprocessing_context=mp.get_context("fork"),
    )

    # Using the OneCycleLR learning rate schedule
    scheduler = optim.lr_scheduler.OneCycleLR(
        optimizer, max_lr=base_lr, steps_per_epoch=len(train_loader), epochs=n_epochs
    )

    # ------ Prepare wandb Table for predictions ------- #
    columns = ["image", "label", "prediction", "score"]
    preds_table = wandb.Table(columns=columns)

    # --------- Start Training ------- #
    final_epoch = n_epochs - 1
    mb = master_bar(range(n_epochs))
    for epoch in mb:
        count = 0
        model.train()

        for inputs, labels in progress_bar(train_loader, parent=mb):
            # zero the parameter gradients
            optimizer.zero_grad()

            inputs, labels = inputs.to(device), labels.to(device)

            # Run model iteration
            outputs = model(inputs)

            # torch.max over dim 1 yields (max values, argmax indices); only
            # the indices (predicted classes) are needed here.
            _, preds = torch.max(outputs, 1)

            loss = criterion(outputs, labels)
            correct = (preds == labels).sum().item()

            loss.backward()
            optimizer.step()
            scheduler.step()

            # Log your metrics to wandb
            logs = {
                "train/train_loss": loss.item(),
                "train/learning_rate": scheduler.get_last_lr()[0],
                "train/correct": correct,
                "train/epoch": epoch + count / len(train_loader),
                "train/count": count,
            }

            # Occasionally log some images to ensure the image data looks correct
            if count % 25 == 0:
                logs["examples/example_images"] = wandb.Image(inputs[:5], caption=f"Step: {count}")

            # Log some predictions to wandb during final epoch for analysis
            if epoch == final_epoch and count % 4 == 0:
                # Softmax score of each predicted class. Computed only here
                # because every .item() call forces a device-to-host sync and
                # the scores are not used on any other batch.
                perct = [
                    torch.nn.functional.softmax(el, dim=0)[i].item()
                    for i, el in zip(preds, outputs)
                ]
                for image, label, pred, score in zip(inputs, labels, preds, perct):
                    preds_table.add_data(wandb.Image(image), label, pred, score)

            # Log metrics to wandb
            wandb.log(logs)

            count += 1

    # Upload your predictions table for analysis
    predictions_artifact = wandb.Artifact(
        "train_predictions_" + str(wandb.run.id), type="train_predictions"
    )
    predictions_artifact.add(preds_table, "train_predictions")
    wandb.run.log_artifact(predictions_artifact)

    # Close your wandb run
    wandb.run.finish()


# Start training, unpacking the model_params dict as keyword arguments.
simple_train_single(**model_params)