Slurm sbatch for a PyTorch script draining node; gres/gpu: count changed for node node002 from 0 to 1

Question

We have a user whose script always drains a node.

Note this error: "gres/gpu: count changed for node node002 from 0 to 1". Could it be misleading? What could cause the node to drain? Below are the contents of the user's SBATCH file. Could the piping be having an effect here? Another thing I noticed while typing this up is a possible mix of library versions: he runs `module load cuda10.0`, but then `module load pytorch-py36-cuda10.1-gcc/1.3.1` and `module load ml-pythondeps-py36-cuda10.1-gcc/3.0.0` (both CUDA 10.1 builds) before invoking python3.6.

#!/bin/sh
# Slurm batch script: runs SequentialBlur_untrained.py once for a single
# cross-validation fold. "$1" is the first argument given after the script
# name on the sbatch command line; it is forwarded to the Python script as
# its last argument and used in the log file name.
#
# Resource request: one node, one task, one GPU; mail on all job events.
#SBATCH -N 1
#SBATCH -n 1
#SBATCH --mail-type=ALL
#SBATCH --gres=gpu:1
# NOTE(review): #SBATCH lines are read by sbatch rather than executed by the
# shell, so "$1" is presumably NOT expanded in the job name -- confirm.
#SBATCH --job-name=$1sequentialBlur_squeezenet_training_imagewoof_crossval
# Start from a clean module environment, then load the toolchain.
module purge
# NOTE(review): cuda10.0 is loaded here, while the pytorch and ml-pythondeps
# modules below are cuda10.1 builds -- confirm this mix is intended.
module load gcc5 cuda10.0
module load openmpi/cuda/64
module load pytorch-py36-cuda10.1-gcc/1.3.1
module load ml-pythondeps-py36-cuda10.1-gcc/3.0.0
# Arguments: model type, number of epochs, image set, fold number ($1).
# tee copies the script's stdout into a per-fold text file.
# NOTE(review): if "$1" is empty the Python script receives only three
# arguments -- confirm the job is always submitted with a fold number.
python3.6 SequentialBlur_untrained.py squeezenet 100 imagewoof $1 | tee squeeze_100_imwoof_seq_longtrain_cv_$1.txt
# NOTE(review): this executes /u/run_seq_blur2.py as a further command after
# the training line finishes -- confirm it belongs in the script.
/u/run_seq_blur2.py

Here are the script contents:

# Banks 1978 paper:
# 1 month:  2.4 cyc/deg
# 2 month:  2.8 cyc/deg
# 3 month:  4 cyc/deg
# 224 pixels:
# 20 deg -> 11 pix in deg;  4.6 pix blur;  4 pix blur;  2.8 pix blur
# 4 deg -> 56 pix in deg; 23 pix blur (1 mo); 20 pix blur (2 mo); 14 pix blur (3 mo)

import torch
import torchvision
import torchvision.transforms as transforms
from torchvision import models
import torchvision.datasets
import matplotlib.pyplot as plt
import numpy as np
import matplotlib.pyplot as plt
import numpy as np
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import os
import sys
import scipy
from torch.utils.data.sampler import SubsetRandomSampler
import h5py

# Command-line interface: <modelType> <numEpochs> <image_set> <block_call>.
# The values are kept as strings here; numEpochs and block_call are converted
# with int() where they are used.
args = sys.argv
if len(args) < 5:
        # Stop with a usage message instead of an opaque IndexError when an
        # argument is missing (for example when the fold number was left off
        # the submitting command line).
        sys.exit('usage: ' + args[0] + ' <modelType> <numEpochs> <image_set> <block_call>')
modelType = args[1] # 'alexnet', 'squeezenet', 'vgg16'
numEpochs = args[2] # int
image_set = str(args[3]) # 'imagewoof', 'imagenette'
block_call = args[4] # int {0:4}

# Example call:
# python3 SequentialBlur_untrained.py alexnet 100 imagenette 1

def get_train_valid_loader(data_dir,block,augment=0,random_seed=69420,valid_size=0.2,shuffle=False,
                                                show_sample=False,num_workers=4, pin_memory=False, batch_size=128):
        """Build the training and validation loaders for one cross-validation fold.

        The images under ``data_dir`` are read with ``ImageFolder``; their indices
        are permuted with a fixed NumPy seed and cut into consecutive folds of
        ``floor(valid_size * N)`` indices.  Fold number ``block`` becomes the
        validation set and every remaining index is used for training.

        Args:
                data_dir: root directory handed to ``torchvision.datasets.ImageFolder``.
                block: zero-based fold index (0..4 for ``valid_size == 0.2``).
                augment, random_seed, shuffle, show_sample: accepted so existing
                        callers keep working, but not used; the permutation is
                        always drawn after ``np.random.seed(100)``.
                valid_size: fraction of the data held out per fold, in [0, 1].
                num_workers, pin_memory, batch_size: forwarded to both loaders.

        Returns:
                A ``(train_loader, valid_loader)`` tuple.

        Note:
                Reseeds NumPy's global random generator as a side effect.
        """
        transform = transforms.Compose([
                transforms.Resize(256),
                transforms.CenterCrop(224),
                transforms.ToTensor(),
                transforms.Normalize(
                        mean=[0.485, 0.456, 0.406],
                        std=[0.229, 0.224, 0.225],
                ),
        ])
        train_dataset = torchvision.datasets.ImageFolder(root=data_dir,transform=transform)
        valid_dataset = torchvision.datasets.ImageFolder(root=data_dir,transform=transform)

        num_train = len(train_dataset)
        indices = list(range(num_train))

        # Fold boundaries inside the permuted index list.
        split = int(np.floor(valid_size * num_train))
        split1 = int(np.floor(block*split))
        split2 = int(np.floor((block+1)*split))

        # Fixed seed: every call sees the same permutation, so the folds taken
        # for different ``block`` values do not overlap.
        np.random.seed(100)
        np.random.shuffle(indices)

        valid_idx = indices[split1:split2]
        train_idx = np.append(indices[:split1],indices[split2:]).astype('int32')
        # (A loop that rotated ``indices`` after this point was removed: the
        # split had already been taken and the rotated list was never read.)

        train_loader = torch.utils.data.DataLoader(
                train_dataset, sampler=SubsetRandomSampler(train_idx), batch_size=batch_size,
                num_workers=num_workers, pin_memory=pin_memory,
        )
        valid_loader = torch.utils.data.DataLoader(
                valid_dataset, sampler=SubsetRandomSampler(valid_idx), batch_size=batch_size,
                num_workers=num_workers, pin_memory=pin_memory,
        )
        return (train_loader, valid_loader)

# Module-level preprocessing pipeline: resize to 256, center-crop to 224,
# convert to a tensor, then normalize each channel with the given mean/std.
# It repeats the steps built inside get_train_valid_loader; in this file it is
# only referenced by the commented-out test-set code further down.
transform = transforms.Compose([
        transforms.Resize(256),
        transforms.CenterCrop(224),
        transforms.ToTensor(),
        transforms.Normalize(
        mean=[0.485, 0.456, 0.406],
        std=[0.229, 0.224, 0.225]
 )])


# Blur families to iterate over in the experiment loop.
blurTypes = ['gaussian']

# Root of the blurred copies of the selected image set.
data_dir = "/path/to/dir/" + image_set + "-320_blur/"


# Collect the distinct names of the directories under data_dir that directly
# contain files; len(classes) sizes the final layer of each network.
# os.path.basename is used instead of splitting on "\\" so that the last path
# component is extracted on POSIX systems too -- with the backslash split,
# every directory path on such a system counted as its own class.
classes = []
for directory, subdirectories, files in os.walk(data_dir):
        if not files:
                continue
        # normpath drops a trailing separator so the walk root yields its
        # own directory name rather than an empty string.
        class_name = os.path.basename(os.path.normpath(directory))
        if class_name not in classes:
                classes.append(class_name)

# Loss function and compute device shared by training and evaluation.
criterion = nn.CrossEntropyLoss()
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
def train():
        """Train the module-level ``net`` on ``trainloader`` for ``numEpochs`` epochs.

        Reads the module-level names ``net``, ``optimizer``, ``criterion``,
        ``device``, ``trainloader`` and ``numEpochs``; takes no arguments and
        returns nothing.  Every tenth epoch it prints the epoch number, the
        number of batches in that epoch and the mean loss per batch.
        """
        for epoch in range(int(numEpochs)):
                running_loss = 0.0
                num_batches = 0
                # Each item from the loader is an [inputs, labels] pair.
                for inputs, labels in trainloader:
                        inputs = inputs.to(device)
                        labels = labels.to(device)

                        # zero the parameter gradients
                        optimizer.zero_grad()

                        # forward + backward + optimize
                        outputs = net(inputs)
                        loss = criterion(outputs, labels)
                        loss.backward()
                        optimizer.step()

                        running_loss += loss.item()
                        num_batches += 1

                # Divide by the real batch count (the sum used to be divided by
                # a fixed 100) and skip the report when the loader was empty,
                # which previously raised NameError on the batch counter.
                if epoch % 10 == 9 and num_batches:
                        print('[%d, %5d] loss: %.3f' %
                                (epoch + 1, num_batches, running_loss / num_batches))

def _blur_train_dir(blurType, mult):
        """Return the training-image directory for one blur type and blur window."""
        return data_dir + blurType + '/' + image_set + '-320_' + str(mult) + '/train'


# Experiment driver.  For the fold given on the command line, train five fresh
# networks; network k is trained in sequence on the blur windows
# permBlurLevels[k:] and then scored on the fold's validation split at each of
# the five blur windows.  allAccs collects one list of accuracies per network.
allAccs = []
permBlurLevels = [23, 11, 5, 3, 1]
for blurType in blurTypes: # multiple types of blur
        print(blurType)
        print('-' * 10)
        block = int(block_call)
        print("\nFOLD " + str(block+1) + ":")
        for start in range(len(permBlurLevels)):
                # Schedules: [23, 11, 5, 3, 1], [11, 5, 3, 1], ..., [1]
                blurLevels = permBlurLevels[start:]

                # Fresh, untrained network with its last layer resized to the
                # number of discovered classes.
                if modelType == 'vgg16':
                        net = torchvision.models.vgg16(pretrained=False)
                        num_ftrs = net.classifier[6].in_features
                        net.classifier[6] = nn.Linear(num_ftrs, len(classes))
                elif modelType == 'alexnet':
                        net = torchvision.models.alexnet(pretrained=False)
                        num_ftrs = net.classifier[6].in_features
                        net.classifier[6] = nn.Linear(num_ftrs, len(classes))
                else:
                        net = torchvision.models.squeezenet1_1(pretrained=False)
                        net.classifier[1] = nn.Conv2d(512, len(classes), kernel_size=(1, 1), stride=(1, 1))
                        net.num_classes = len(classes)
                optimizer = optim.SGD(net.parameters(), lr=0.001, momentum=0.9)
                net = net.to(device)

                # Training phase.  train() reads net, optimizer and trainloader
                # from module scope, so they must be assigned under these names.
                for mult in blurLevels:
                        trainloader, _ = get_train_valid_loader(data_dir=_blur_train_dir(blurType, mult),
                                block=block,shuffle=False,num_workers=0,batch_size=128)
                        print('Start training on blur window of ' + str(mult))
                        train()
                        print('Finished Training on ' + blurType + ' with blur window of ' + str(mult))

                # Evaluation phase.  Switch to eval mode so layers such as
                # dropout behave deterministically; the next schedule builds a
                # new network, so train mode need not be restored.
                net.eval()
                accs = []
                for tempMult in permBlurLevels:
                        correct = 0
                        total = 0
                        # Use the blur window being evaluated (tempMult).  The
                        # loader was previously built from the last training
                        # window, so all five scores came from the same images.
                        _, validloader2 = get_train_valid_loader(data_dir=_blur_train_dir(blurType, tempMult),
                                block=block,shuffle=False,num_workers=0,batch_size=128)

                        with torch.no_grad():
                                for images, labels in validloader2:
                                        images = images.to(device)
                                        labels = labels.to(device)
                                        outputs = net(images)
                                        _, predicted = torch.max(outputs.data, 1)
                                        total += labels.size(0)
                                        correct += (predicted == labels).sum().item()
                        # An empty validation split gives NaN rather than an
                        # unbound or stale accuracy value.
                        acc = 100 * correct / total if total else float('nan')
                        print('Accuracy: %f %%' % (acc))
                        accs.append(acc)
                allAccs.append(accs)

And here are the errors we see each time he runs this:

[2020-03-13T08:54:02.269] gres/gpu: count changed for node node002 from 0 to 1
[2020-03-13T08:54:02.269] error: Setting node node002 state to DRAIN
[2020-03-13T08:54:02.269] drain_nodes: node node002 state set to DRAIN
[2020-03-13T08:54:02.269] error: _slurm_rpc_node_registration node=node002: Invalid argument

The only reference I could find that even mentions this "gres/gpu: count changed for node..." message is a 2015 slurmd bug report, which turned out not to be a bug.

score 0 · Answer 1 · answered Sep 11 '20 at 19:36

The problem was the additional call to run_seq_blur2.py. We were getting a list index error: when you submit the sbatch script, you have to pass the additional parameter on the sbatch command line. The user had modified the script that way to make it easier to run permutations of the Python file without changing the sbatch script. For example:

sbatch run_seq_blur3.py 0

where 0 can be any value from 0 - 4.

The final line in the sbatch file now looks like this:

python3.6 SequentialBlur_untrained.py alexnet 100 imagewoof 0

Anyways, it no longer drains the node.

Slurm sbatch for a PyTorch script draining node; gres/gpu: count changed for node node002 from 0 to 1

1 Answer

Linked