I tried to train an image classifier based on MobilenetV2, but loss has not been able to converge, I am not sure if I use tensorflow correctly

Question

I tried to use the MobilenetV2 model as an image classifier. There are 10 categories, which are randomly selected from ImageNet's classification dataset. But loss does not converge during training, it always tends to fluctuate back and forth with a large value. I don't know if I use the tensorflow API correctly, because I don't know much about the eager mode in the new version. Can anyone help me? Many thanks.

The version I am using is 1.14. I tried to run the code directly on version 2.0 and I got similar results. The model I am using is from https://github.com/qxde01/keras-alchemy/blob/master/models/mobilenet_v2.py

Below is my training code and data set processing code:

# train code
from __future__ import absolute_import, division, print_function, unicode_literals
import argparse
import logging
import numpy as np
import os
import random
import tensorflow as tf

from config import cfg
from mobile_net_v2.dataset import ImageNet_Dataset
from mobile_net_v2 import mobilenet_v2

logger = logging.getLogger("train")
logger.setLevel(logging.INFO)

tf.compat.v1.enable_eager_execution()
tf.keras.backend.clear_session()


def set_random_seed(seed):
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    tf.compat.v1.set_random_seed(seed)

def main():
    cfg.merge_from_file(args.cfg)

    train_dataset = ImageNet_Dataset(**cfg.DATASET.IMGNET)
    def get_train_data_by_cv2(index):
        img, cls_label, path = train_dataset[index]
        return img, cls_label, path
    index_sum = len(train_dataset.imgs_path_list)
    assert index_sum != 0, 'Dataset information read error'
    index = list(range(0, index_sum))
    random.shuffle(index)
    dataset = tf.data.Dataset.from_tensor_slices(index)
    dataset = dataset.map(lambda index: tf.py_function(
        get_train_data_by_cv2, [index], [tf.float32, tf.int32, tf.string]),
                          num_parallel_calls=tf.data.experimental.AUTOTUNE)
    dataset = dataset.batch(32).repeat(10).shuffle(32)

    MNV2_Model = mobilenet_v2.MobileNetV2(include_top=True, input_shape=(127, 127, 3), alpha=0.5, classes=cfg.DATASET.IMGNET.MAX_NUM_OF_CLASS)

    optimizer = tf.keras.optimizers.Adam(learning_rate=0.005)
    # optimizer = tf.keras.optimizers.Adam(learning_rate=cfg.TRAIN.BASE_LR)
    global_step = tf.Variable(0, dtype=tf.int64)

    checkpoint = tf.train.Checkpoint(Model=MNV2_Model , Global_Step=global_step, Optimizer=optimizer)
    ckp_manager = tf.train.CheckpointManager(checkpoint, directory=args.ckp_dir, max_to_keep=args.num_of_ckp_to_keep)
    if ckp_manager.latest_checkpoint:
        print("Restored from {}".format(ckp_manager.latest_checkpoint))
        checkpoint.restore(ckp_manager.latest_checkpoint)
    else:
        print("Initializing from scratch.")

    # tensorboard
    summary_writer = tf.compat.v2.summary.create_file_writer(args.tsb_dir)

    batch_count_end = tf.convert_to_tensor(0, dtype=tf.float32)
    batch_count = 0
    grads_cache = []
    for batch_index, (img, label_cls, path) in enumerate(dataset):
        with tf.GradientTape() as tape:

            cls = MNV2_Model(img)

            batch, c = cls.shape
            # batch, h, w, c = cls.shape
            cls_reshape = tf.reshape(cls, [batch, -1])
            label_cls = tf.reshape(label_cls, [-1])
            # cls_loss = tf.nn.sparse_softmax_cross_entropy_with_logits(logits=cls_reshape, labels=label_cls)
            cls_loss = tf.keras.losses.sparse_categorical_crossentropy(y_pred=cls_reshape, y_true=label_cls)
            cls_loss = tf.reduce_mean(cls_loss)
        # trainable_variables = MNV2_Model.trainable_variables
        # tape_watched_variables = tape.watched_variables()  

        gradients = tape.gradient(cls_loss, MNV2_Model.trainable_variables)
        len_ = len(gradients)
        if batch_count_end.numpy() == 0:
            optimizer.apply_gradients(zip(gradients, MNV2_Model.trainable_variables))
            logger.info("train step = {}, cls_loss = {:.5f}".format(global_step.numpy(), cls_loss.numpy()))
            with summary_writer.as_default():
                tf.compat.v2.summary.scalar("cls_loss", cls_loss, step=global_step)
                tf.compat.v2.summary.scalar("LR", optimizer.learning_rate, step=global_step)
        else:
            if batch_count == 0:
                grads_cache = gradients
                batch_count += 1
                logger.info("train step = {}, cls_loss = {:.5f}".format(global_step.numpy(), cls_loss.numpy()))
                with summary_writer.as_default():
                    tf.compat.v2.summary.scalar("cls_loss", cls_loss, step=global_step)
                    tf.compat.v2.summary.scalar("LR", optimizer.learning_rate, step=global_step)
            else:
                for ind in range(len_):
                    if gradients[ind] is None:
                        continue
                    grads_cache[ind] += gradients[ind]  
                if batch_count == batch_count_end.numpy():
                    batch_count = 0

                    for ind in range(len_):
                        if grads_cache[ind] is None:
                            continue
                        grads_cache[ind] = grads_cache[ind] / (batch_count_end + 1.)
                    optimizer.apply_gradients(zip(grads_cache, MNV2_Model.trainable_variables))
                else:
                    batch_count += 1
        if batch_index % args.ckp_save_interval == 0:
            ckp_manager.save()
        global_step = global_step + 1



if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    req_grp = parser.add_argument_group('required')
    req_grp.add_argument('--ckp_dir', default="./backbone_cp", help='directory to save checkpoints to')
    req_grp.add_argument('--ckp_save_interval', default=2000, help='interval between model checkpoints')
    req_grp.add_argument('--num_of_ckp_to_keep', default=5, help='the number of checkpoints to keep')
    req_grp.add_argument('--tsb_dir', default="./backbone_tsb", help='directory to save tensorboard info to')
    req_grp.add_argument('--cfg', type=str, default='../config_yaml/mobilenetv2_lt_config.yaml',
                         help='configuration of tracking')
    parser.add_argument('--seed', type=int, default=123456, help='random seed')
    args = parser.parse_args()

    logger.info("train start")
    if not os.path.exists(args.ckp_dir):
        abs_dir = os.path.abspath(args.ckp_dir)
        logger.warning("checkpoint directory does not exist , now we will crate this directory : "+abs_dir)
        os.makedirs(abs_dir)
    if not os.path.exists(args.tsb_dir):
        abs_dir = os.path.abspath(args.tsb_dir)
        logger.warning("tensorboard directory does not exist , now we will crate this directory : "+abs_dir)
        os.makedirs(abs_dir)
    assert os.path.exists(args.cfg), 'The configuration file does not exist, and the path may be filled incorrectly.'

    set_random_seed(args.seed)
    main()

# ILSVRC2012_CLASS dataset code
# -*- coding: UTF-8 -*-
from __future__ import absolute_import, division, print_function, unicode_literals

import os
import cv2
import numpy as np
import tensorflow as tf
import random
import scipy.io
from config import cfg

class ImageNet_Dataset:
    def __init__(self, ROOT, WIND_ROOT, MAX_NUM_OF_CLASS=1000, ):
        self.WIND_To_ID_Dict = {}
        synsets = scipy.io.loadmat(WIND_ROOT)['synsets']
        for synset in synsets:
            self.WIND_To_ID_Dict[synset[0][1][0]] = synset[0][0][0][0]  # WIND=ID
        self.imgs_path_list = []
        self.img_label_list = []
        anno_file = os.path.join(ROOT, 'anno.txt')
        if not os.path.exists(anno_file):
            print("Tag file does not exist, create")
            count = 0
            for file_name in os.listdir(ROOT):
                path = os.path.join(ROOT, file_name)
                if os.path.isdir(path) and file_name[0] == "n":
                    this_class_img_list = os.listdir(path)
                    self.imgs_path_list += [os.path.join(path, img_name) for img_name in this_class_img_list]
                    this_img_label = self.WIND_To_ID_Dict[file_name] - 1
                    this_img_label_list = [this_img_label]*len(this_class_img_list)
                    self.img_label_list += this_img_label_list
                    count += 1
                print("finish:", path, "count", count)

            file = open(os.path.join(ROOT, 'anno.txt'), 'w')
            for img_path, label in zip(self.imgs_path_list, self.img_label_list):
                file.writelines(img_path + " " + "{}".format(label) + "\n")
            file.close()
        else:
            print("Tag file exists, read")
            for line in open(anno_file, encoding='utf-8'):
                path_and_label = line.strip("\n").split(' ')
                self.imgs_path_list += [path_and_label[0]]
                self.img_label_list += [int(path_and_label[1])]
            count_temp = 0
            val_mem = -1
            for index, label in enumerate(self.img_label_list):
                if label != val_mem:
                    val_mem = label
                    count_temp += 1
                    if label >= 1000:
                        print("Outlier：", label)
            print("A total of {} classes".format(count_temp))
        # Crop data set
        if MAX_NUM_OF_CLASS != 1000:
            # Randomly select N classes
            temp = list(np.arange(1000))
            random_choose_label = random.sample(temp, MAX_NUM_OF_CLASS)
            # Select class and path
            label_list_copy = self.img_label_list[:]
            path_list_copy = self.imgs_path_list[:]
            self.img_label_list = []
            self.imgs_path_list = []
            for index, label in enumerate(label_list_copy):
                if label in random_choose_label:
                    self.img_label_list.append(label_list_copy[index])
                    self.imgs_path_list.append(path_list_copy[index])
            # Select WIND
            choosed_WIND = {}
            # Recode tags and WIND
            val_mem = -1
            count_temp = -1
            for index, val in enumerate(self.img_label_list):
                if val != val_mem:
                    val_mem = val
                    count_temp += 1
                    for WIND in self.WIND_To_ID_Dict.keys():
                        if self.WIND_To_ID_Dict[WIND] == val:
                            choosed_WIND[WIND] = count_temp
                            print("Selected class：", WIND)
                            break
                self.img_label_list[index] = count_temp
            self.WIND_To_ID_Dict = choosed_WIND
            print("Final selection of {} classes".format(len(self.WIND_To_ID_Dict)))
            # Save the new WIND and ID table
            file = open(os.path.join(ROOT, 'anno_select.txt'), 'w')
            for KIND in self.WIND_To_ID_Dict.keys():
                file.writelines(KIND + " " + "{}".format(self.WIND_To_ID_Dict[KIND]) + "\n")
            file.close()
            file = open(os.path.join(ROOT, 'new_anno.txt'), 'w')
            for path, label in zip(self.imgs_path_list, self.img_label_list):
                file.writelines(path + " " + "{}".format(label) + "\n")
            file.close()

    def __getitem__(self, index):
        image = cv2.imread(self.imgs_path_list[index])
        label = self.img_label_list[index]
        image_resize = cv2.resize(image, (127, 127)).astype(np.float32)
        return image_resize, label, self.imgs_path_list[index]


if __name__ == "__main__":
    dataset = ImageNet_Dataset(**cfg.DATASET.IMGNET)
    path_list = dataset.imgs_path_list
    label_list = dataset.img_label_list
    WIND_ID = dataset.WIND_To_ID_Dict
    print("Total number of training pictures", len(dataset.imgs_path_list))
    check_list = [100, 1500, 2000, 1300, 188, 1503]
    for ind in check_list:
        im, label = dataset[ind]
        cv2.imshow("label {} ind {}".format(label, ind), im)
    cv2.waitKey()

The following is the print content of the training process:

2019-11-02 09:16:17.201821: I tensorflow/stream_executor/platform/default/dso_loader.cc:42] Successfully opened dynamic library nvcuda.dll
2019-11-02 09:16:17.289288: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1640] Found device 0 with properties: 
name: GeForce GTX 1050 Ti major: 6 minor: 1 memoryClockRate(GHz): 1.493
pciBusID: 0000:01:00.0
2019-11-02 09:16:17.289592: I tensorflow/stream_executor/platform/default/dlopen_checker_stub.cc:25] GPU libraries are statically linked, skip dlopen check.
2019-11-02 09:16:17.291786: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1763] Adding visible gpu devices: 0
2019-11-02 09:16:17.292160: I tensorflow/core/platform/cpu_feature_guard.cc:142] Your CPU supports instructions that this TensorFlow binary was not compiled to use: AVX2
2019-11-02 09:16:17.295064: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1640] Found device 0 with properties: 
name: GeForce GTX 1050 Ti major: 6 minor: 1 memoryClockRate(GHz): 1.493
pciBusID: 0000:01:00.0
2019-11-02 09:16:17.295369: I tensorflow/stream_executor/platform/default/dlopen_checker_stub.cc:25] GPU libraries are statically linked, skip dlopen check.
2019-11-02 09:16:17.297488: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1763] Adding visible gpu devices: 0
2019-11-02 09:16:17.872697: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1181] Device interconnect StreamExecutor with strength 1 edge matrix:
2019-11-02 09:16:17.872919: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1187]      0 
2019-11-02 09:16:17.873052: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1200] 0:   N 
2019-11-02 09:16:17.875796: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1326] Created TensorFlow device (/job:localhost/replica:0/task:0/device:GPU:0 with 3001 MB memory) -> physical GPU (device: 0, name: GeForce GTX 1050 Ti, pci bus id: 0000:01:00.0, compute capability: 6.1)
W1102 09:16:18.032456 14192 deprecation.py:323] From C:\software\Anaconda3\envs\CV_env\lib\site-packages\tensorflow\python\data\util\random_seed.py:58: add_dispatch_support.<locals>.wrapper (from tensorflow.python.ops.array_ops) is deprecated and will be removed in a future version.
Instructions for updating:
Use tf.where in 2.0, which has the same broadcast rule as np.where
Initializing from scratch.
2019-11-02 09:16:20.087193: W .\tensorflow/core/framework/model.h:213] Encountered a stop event that was not preceded by a start event.
W1102 09:16:20.077111 15760 backprop.py:820] The dtype of the watched tensor must be floating (e.g. tf.float32), got tf.int32
W1102 09:16:20.077111 16592 backprop.py:820] The dtype of the watched tensor must be floating (e.g. tf.float32), got tf.int32
W1102 09:16:20.077111  6140 backprop.py:820] The dtype of the watched tensor must be floating (e.g. tf.float32), got tf.int32
W1102 09:16:20.077111  1564 backprop.py:820] The dtype of the watched tensor must be floating (e.g. tf.float32), got tf.int32
W1102 09:16:20.092706 15760 backprop.py:820] The dtype of the watched tensor must be floating (e.g. tf.float32), got tf.int32
I1102 09:16:22.997265 14192 train_backbone.py:97] train step = 0, cls_loss = 2.30259
I1102 09:16:23.513478 14192 train_backbone.py:97] train step = 1, cls_loss = 2.30025
I1102 09:16:23.750537 14192 train_backbone.py:97] train step = 2, cls_loss = 2.29853
I1102 09:16:23.980576 14192 train_backbone.py:97] train step = 3, cls_loss = 2.24046
I1102 09:16:24.480434 14192 train_backbone.py:97] train step = 4, cls_loss = 2.41200
I1102 09:16:24.714752 14192 train_backbone.py:97] train step = 5, cls_loss = 2.36562
I1102 09:16:24.949074 14192 train_backbone.py:97] train step = 6, cls_loss = 2.37916
I1102 09:16:25.183419 14192 train_backbone.py:97] train step = 7, cls_loss = 2.29218
I1102 09:16:25.402090 14192 train_backbone.py:97] train step = 8, cls_loss = 2.28306
I1102 09:16:25.637276 14192 train_backbone.py:97] train step = 9, cls_loss = 2.28749
I1102 09:16:25.887218 14192 train_backbone.py:97] train step = 10, cls_loss = 2.29699
I1102 09:16:26.121537 14192 train_backbone.py:97] train step = 11, cls_loss = 2.26547
I1102 09:16:26.355856 14192 train_backbone.py:97] train step = 12, cls_loss = 2.35975
I1102 09:16:26.590177 14192 train_backbone.py:97] train step = 13, cls_loss = 2.35546
I1102 09:16:26.813820 14192 train_backbone.py:97] train step = 14, cls_loss = 2.23688
I1102 09:16:27.049402 14192 train_backbone.py:97] train step = 15, cls_loss = 2.35449
I1102 09:16:27.314964 14192 train_backbone.py:97] train step = 16, cls_loss = 2.32679
I1102 09:16:27.564905 14192 train_backbone.py:97] train step = 17, cls_loss = 2.26078
I1102 09:16:27.799224 14192 train_backbone.py:97] train step = 18, cls_loss = 2.31926
I1102 09:16:28.028345 14192 train_backbone.py:97] train step = 19, cls_loss = 2.37051
I1102 09:16:28.262664 14192 train_backbone.py:97] train step = 20, cls_loss = 2.33416
I1102 09:16:28.481362 14192 train_backbone.py:97] train step = 21, cls_loss = 2.31699
I1102 09:16:28.715682 14192 train_backbone.py:97] train step = 22, cls_loss = 2.31238
I1102 09:16:28.965623 14192 train_backbone.py:97] train step = 23, cls_loss = 2.29348
I1102 09:16:29.371894 14192 train_backbone.py:97] train step = 24, cls_loss = 2.31252
I1102 09:16:29.624755 14192 train_backbone.py:97] train step = 25, cls_loss = 2.29014
I1102 09:16:29.868103 14192 train_backbone.py:97] train step = 26, cls_loss = 2.29235
I1102 09:16:30.104471 14192 train_backbone.py:97] train step = 27, cls_loss = 2.29885
I1102 09:16:30.343830 14192 train_backbone.py:97] train step = 28, cls_loss = 2.30771

（ellipsis）

I1102 09:31:27.488112 14192 train_backbone.py:97] train step = 3781, cls_loss = 2.29822
I1102 09:31:27.694538 14192 train_backbone.py:97] train step = 3782, cls_loss = 2.28998
I1102 09:31:27.898992 14192 train_backbone.py:97] train step = 3783, cls_loss = 2.29968
I1102 09:31:28.107434 14192 train_backbone.py:97] train step = 3784, cls_loss = 2.31657
I1102 09:31:28.313882 14192 train_backbone.py:97] train step = 3785, cls_loss = 2.33363
I1102 09:31:28.525316 14192 train_backbone.py:97] train step = 3786, cls_loss = 2.30077
I1102 09:31:28.735753 14192 train_backbone.py:97] train step = 3787, cls_loss = 2.29594
I1102 09:31:28.965138 14192 train_backbone.py:97] train step = 3788, cls_loss = 2.28513
I1102 09:31:29.176574 14192 train_backbone.py:97] train step = 3789, cls_loss = 2.30248
I1102 09:31:29.386014 14192 train_backbone.py:97] train step = 3790, cls_loss = 2.28849
I1102 09:31:29.595453 14192 train_backbone.py:97] train step = 3791, cls_loss = 2.28419
I1102 09:31:29.821846 14192 train_backbone.py:97] train step = 3792, cls_loss = 2.24379
I1102 09:31:30.029292 14192 train_backbone.py:97] train step = 3793, cls_loss = 2.31936
I1102 09:31:30.236737 14192 train_backbone.py:97] train step = 3794, cls_loss = 2.29435
I1102 09:31:30.442188 14192 train_backbone.py:97] train step = 3795, cls_loss = 2.29231
I1102 09:31:30.648634 14192 train_backbone.py:97] train step = 3796, cls_loss = 2.30401
I1102 09:31:30.855082 14192 train_backbone.py:97] train step = 3797, cls_loss = 2.28225
I1102 09:31:31.057540 14192 train_backbone.py:97] train step = 3798, cls_loss = 2.30102
I1102 09:31:31.268974 14192 train_backbone.py:97] train step = 3799, cls_loss = 2.27844
I1102 09:31:31.469438 14192 train_backbone.py:97] train step = 3800, cls_loss = 2.31878
I1102 09:31:31.676883 14192 train_backbone.py:97] train step = 3801, cls_loss = 2.31835
I1102 09:31:31.890312 14192 train_backbone.py:97] train step = 3802, cls_loss = 2.25618
I1102 09:31:32.096760 14192 train_backbone.py:97] train step = 3803, cls_loss = 2.31030
I1102 09:31:32.302210 14192 train_backbone.py:97] train step = 3804, cls_loss = 2.30390
I1102 09:31:32.510651 14192 train_backbone.py:97] train step = 3805, cls_loss = 2.29039
I1102 09:31:32.731063 14192 train_backbone.py:97] train step = 3806, cls_loss = 2.27882
I1102 09:31:32.961446 14192 train_backbone.py:97] train step = 3807, cls_loss = 2.33232
I1102 09:31:33.165899 14192 train_backbone.py:97] train step = 3808, cls_loss = 2.28104
I1102 09:31:33.379328 14192 train_backbone.py:97] train step = 3809, cls_loss = 2.28902

Process finished with exit code 0

score 1 · Answer 1 · answered Nov 04 '19 at 10:08

I will ask myself to answer. The reason I found the problem today is that the BN layer has not been updated. I blame this on the tensorflow team for not giving detailed documentation. In older versions, the BN layer needed to be updated manually. In eager mode, this manually updated code does not work. You need to add tf.keras.backend.set_learning_phase(True) before the training loop starts, and the training will work normally. Need to change to False before the test loop.

I tried to train an image classifier based on MobilenetV2, but loss has not been able to converge, I am not sure if I use tensorflow correctly

1 Answers1