
I trained on a dataset (~8000 images) in Caffe with the AlexNet network, using a batch size of 5; this gave a prediction time of 800-900 ms. I then changed the batch size to 56 (the maximum my machine can support) and the prediction time dropped to 200-300 ms on CPU.

I understand that changing the batch size in stochastic gradient descent can reduce training time, and I know that AlexNet is normally trained with a batch size of 256, but I am using 56 because of my low-spec machine.

But how does the training batch size affect the prediction time for a single test image?

# AlexNet
name: "AlexNet"
layer {
  name: "train-data"
  type: "Data"
  top: "data"
  top: "label"
  transform_param {
    mirror: true
    crop_size: 227
  }
  data_param {
    batch_size: 128
  }
  include { stage: "train" }
}
layer {
  name: "val-data"
  type: "Data"
  top: "data"
  top: "label"
  transform_param {
    crop_size: 227
  }
  data_param {
    batch_size: 32
  }
  include { stage: "val" }
}
layer {
  name: "conv1"
  type: "Convolution"
  bottom: "data"
  top: "conv1"
  param {
    lr_mult: 1
    decay_mult: 1
  }
  param {
    lr_mult: 2
    decay_mult: 0
  }
  convolution_param {
    num_output: 96
    kernel_size: 11
    stride: 4
    weight_filler {
      type: "gaussian"
      std: 0.01
    }
    bias_filler {
      type: "constant"
      value: 0
    }
  }
}
layer {
  name: "relu1"
  type: "ReLU"
  bottom: "conv1"
  top: "conv1"
}
layer {
  name: "norm1"
  type: "LRN"
  bottom: "conv1"
  top: "norm1"
  lrn_param {
    local_size: 5
    alpha: 0.0001
    beta: 0.75
  }
}
layer {
  name: "pool1"
  type: "Pooling"
  bottom: "norm1"
  top: "pool1"
  pooling_param {
    pool: MAX
    kernel_size: 3
    stride: 2
  }
}
layer {
  name: "conv2"
  type: "Convolution"
  bottom: "pool1"
  top: "conv2"
  param {
    lr_mult: 1
    decay_mult: 1
  }
  param {
    lr_mult: 2
    decay_mult: 0
  }
  convolution_param {
    num_output: 256
    pad: 2
    kernel_size: 5
    group: 2
    weight_filler {
      type: "gaussian"
      std: 0.01
    }
    bias_filler {
      type: "constant"
      value: 0.1
    }
  }
}
layer {
  name: "relu2"
  type: "ReLU"
  bottom: "conv2"
  top: "conv2"
}
layer {
  name: "norm2"
  type: "LRN"
  bottom: "conv2"
  top: "norm2"
  lrn_param {
    local_size: 5
    alpha: 0.0001
    beta: 0.75
  }
}
layer {
  name: "pool2"
  type: "Pooling"
  bottom: "norm2"
  top: "pool2"
  pooling_param {
    pool: MAX
    kernel_size: 3
    stride: 2
  }
}
layer {
  name: "conv3"
  type: "Convolution"
  bottom: "pool2"
  top: "conv3"
  param {
    lr_mult: 1
    decay_mult: 1
  }
  param {
    lr_mult: 2
    decay_mult: 0
  }
  convolution_param {
    num_output: 384
    pad: 1
    kernel_size: 3
    weight_filler {
      type: "gaussian"
      std: 0.01
    }
    bias_filler {
      type: "constant"
      value: 0
    }
  }
}
layer {
  name: "relu3"
  type: "ReLU"
  bottom: "conv3"
  top: "conv3"
}
layer {
  name: "conv4"
  type: "Convolution"
  bottom: "conv3"
  top: "conv4"
  param {
    lr_mult: 1
    decay_mult: 1
  }
  param {
    lr_mult: 2
    decay_mult: 0
  }
  convolution_param {
    num_output: 384
    pad: 1
    kernel_size: 3
    group: 2
    weight_filler {
      type: "gaussian"
      std: 0.01
    }
    bias_filler {
      type: "constant"
      value: 0.1
    }
  }
}
layer {
  name: "relu4"
  type: "ReLU"
  bottom: "conv4"
  top: "conv4"
}
layer {
  name: "conv5"
  type: "Convolution"
  bottom: "conv4"
  top: "conv5"
  param {
    lr_mult: 1
    decay_mult: 1
  }
  param {
    lr_mult: 2
    decay_mult: 0
  }
  convolution_param {
    num_output: 256
    pad: 1
    kernel_size: 3
    group: 2
    weight_filler {
      type: "gaussian"
      std: 0.01
    }
    bias_filler {
      type: "constant"
      value: 0.1
    }
  }
}
layer {
  name: "relu5"
  type: "ReLU"
  bottom: "conv5"
  top: "conv5"
}
layer {
  name: "pool5"
  type: "Pooling"
  bottom: "conv5"
  top: "pool5"
  pooling_param {
    pool: MAX
    kernel_size: 3
    stride: 2
  }
}
layer {
  name: "fc6"
  type: "InnerProduct"
  bottom: "pool5"
  top: "fc6"
  param {
    lr_mult: 1
    decay_mult: 1
  }
  param {
    lr_mult: 2
    decay_mult: 0
  }
  inner_product_param {
    num_output: 4096
    weight_filler {
      type: "gaussian"
      std: 0.005
    }
    bias_filler {
      type: "constant"
      value: 0.1
    }
  }
}
layer {
  name: "relu6"
  type: "ReLU"
  bottom: "fc6"
  top: "fc6"
}
layer {
  name: "drop6"
  type: "Dropout"
  bottom: "fc6"
  top: "fc6"
  dropout_param {
    dropout_ratio: 0.5
  }
}
layer {
  name: "fc7"
  type: "InnerProduct"
  bottom: "fc6"
  top: "fc7"
  param {
    lr_mult: 1
    decay_mult: 1
  }
  param {
    lr_mult: 2
    decay_mult: 0
  }
  inner_product_param {
    num_output: 4096
    weight_filler {
      type: "gaussian"
      std: 0.005
    }
    bias_filler {
      type: "constant"
      value: 0.1
    }
  }
}
layer {
  name: "relu7"
  type: "ReLU"
  bottom: "fc7"
  top: "fc7"
}
layer {
  name: "drop7"
  type: "Dropout"
  bottom: "fc7"
  top: "fc7"
  dropout_param {
    dropout_ratio: 0.5
  }
}
layer {
  name: "fc8"
  type: "InnerProduct"
  bottom: "fc7"
  top: "fc8"
  param {
    lr_mult: 1
    decay_mult: 1
  }
  param {
    lr_mult: 2
    decay_mult: 0
  }
  inner_product_param {
    # Since num_output is unset, DIGITS will automatically set it to the
    #   number of classes in your dataset.
    # Uncomment this line to set it explicitly:
    #num_output: 1000
    weight_filler {
      type: "gaussian"
      std: 0.01
    }
    bias_filler {
      type: "constant"
      value: 0
    }
  }
}
layer {
  name: "accuracy"
  type: "Accuracy"
  bottom: "fc8"
  bottom: "label"
  top: "accuracy"
  include { stage: "val" }
}
layer {
  name: "loss"
  type: "SoftmaxWithLoss"
  bottom: "fc8"
  bottom: "label"
  top: "loss"
  exclude { stage: "deploy" }
}
layer {
  name: "softmax"
  type: "Softmax"
  bottom: "fc8"
  top: "softmax"
  include { stage: "deploy" }
}
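
For reference, the per-frame prediction time in a setup like this is typically measured with pycaffe roughly as sketched below; deploy.prototxt, snapshot.caffemodel and frame.jpg are placeholder names, not files from this question.

import time
import caffe

caffe.set_mode_cpu()

# Placeholder paths; substitute the deploy network and trained weights.
net = caffe.Net('deploy.prototxt', 'snapshot.caffemodel', caffe.TEST)

# One frame per forward pass (batch size 1).
net.blobs['data'].reshape(1, 3, 227, 227)

# Standard pycaffe preprocessing: HxWxC [0,1] image -> CxHxW BGR [0,255].
transformer = caffe.io.Transformer({'data': net.blobs['data'].data.shape})
transformer.set_transpose('data', (2, 0, 1))
transformer.set_raw_scale('data', 255)
transformer.set_channel_swap('data', (2, 1, 0))

image = caffe.io.load_image('frame.jpg')  # placeholder test image
net.blobs['data'].data[...] = transformer.preprocess('data', image)

start = time.time()
net.forward()
print('single-frame prediction: %.1f ms' % ((time.time() - start) * 1000))
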
  • Are you using the same batch size in prediction for the two models, independent of the batch size used to train? – Prune Feb 28 '17 at 18:56
  • If you are asking about changing the batch size in solver.prototxt for the trained model, then no, I did not change anything in the trained model or related files. Excuse me if this is not what you were asking; I am a beginner with Caffe. – danishansari Feb 28 '17 at 19:02
  • The question is how many prediction requests you sent to each model. Was it the same for both? – Prune Feb 28 '17 at 19:44
  • Yes, I used the same testing data for both models, and the time stated is per individual frame. – danishansari Mar 01 '17 at 05:04
  • Right -- however, if you send a batch of 50 images to handle at once and then divide the total time by 50, you'll get a much lower per-image figure than if you send 50 individual requests. – Prune Mar 01 '17 at 21:55
  • Thanks for your response, but my requirements force me to perform per-frame classification. Did you find any problem with the network model I added? – danishansari Mar 02 '17 at 04:11
  • No, because (1) I'm not going to read through the topology to find possible discrepancies from the original. There are text comparison tools for that, and that would be your task; (2) one proper analysis would be per-layer timing -- again, there are tools for that and it's your task; (3) what you posted is the model *topology*, rather than the trained models. I'm hopeful that you didn't change the topology, although it appears that you've at least renamed the data layers. – Prune Mar 03 '17 at 17:08
  • No, I didn't change anything. – danishansari Mar 04 '17 at 05:01

1 Answer


If those are also your prediction batch sizes, then the speed increase is merely parallelization of the scoring process: the total batch time is divided over more images, so the per-image figure drops. However, if the speed is measured properly, with identical batch sizes during prediction ...
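
To make the first point concrete, here is a rough pycaffe sketch that compares the per-image time of a batched forward pass against a batch of one; the prototxt and caffemodel paths are placeholders.

import time
import numpy as np
import caffe

caffe.set_mode_cpu()
net = caffe.Net('deploy.prototxt', 'snapshot.caffemodel', caffe.TEST)  # placeholder paths

def per_image_ms(batch_size):
    # Forward one batch of random 227x227 inputs and divide the wall-clock
    # time by the batch size to get the "per image" figure.
    net.blobs['data'].reshape(batch_size, 3, 227, 227)
    net.blobs['data'].data[...] = np.random.rand(batch_size, 3, 227, 227)
    start = time.time()
    net.forward()
    return (time.time() - start) * 1000.0 / batch_size

print('batch size  1: %.1f ms per image' % per_image_ms(1))
print('batch size 56: %.1f ms per image' % per_image_ms(56))

The batched figure is usually lower because per-call overhead and the matrix-multiplication efficiency of the fully-connected layers are amortised over 56 images, not because any single image is classified faster.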

This depends a lot on your hardware and any short-cuts involved. Since you haven't displayed the models you trained, it's hard to tell. One hypothesis I have is that your second model managed to eliminate more of the trained parameters from affecting the final decision (i.e. weight = 0.0), and that your software optimizations or hardware short-cuts subsequently sped up the computations.

Another possibility is that the larger model is actually richer, such that compiling the model results in using on-chip matrix operations rather than individual sparse-matrix operations (which could be slower, if you got unlucky). I doubt that this is the case.
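
If you want to narrow this down yourself, per-layer timing (as already suggested in the comments) is the usual starting point. A rough pycaffe sketch, again with placeholder paths:

import time
import caffe

caffe.set_mode_cpu()
net = caffe.Net('deploy.prototxt', 'snapshot.caffemodel', caffe.TEST)  # placeholder paths

# Run the layers one at a time, in order, so each layer sees the outputs of
# the layers before it, and report each layer's forward time.
for name in net._layer_names:
    start = time.time()
    net.forward(start=name, end=name)
    print('%-10s %.2f ms' % (name, (time.time() - start) * 1000))

Caffe's caffe time command-line tool gives a similar layer-by-layer breakdown without any extra code.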

Prune
  • Can you explain the last paragraph? – Deepak Feb 28 '17 at 18:49
  • I don't consider it likely, but I find it possible that sparse parameters and weights could engender machine code that would run more slowly than full-matrix operations. Many machine-learning computers now sport chips (e.g. Xeon Phi) with matrix operations in hardware. – Prune Feb 28 '17 at 18:53
  • @Prune So basically you are saying that changing the batch size will not affect prediction time? – danishansari Feb 28 '17 at 18:54
  • I'm saying that I don't think this second case is likely; this shouldn't give you a 3-4x performance difference. Again, we can only brainstorm, not diagnose, since you haven't found a way to post the models in digestible form. – Prune Feb 28 '17 at 18:59
  • It seems pretty obvious that the second case won't give this much of a performance gain on the same machine. – danishansari Feb 28 '17 at 19:06