0

I am trying to train and test a simple network for segmentation. I have created the LMDB file using Windows C++.

But while doing training in Windows C++ it gets stuck in

 boost::shared_ptr<caffe::Solver<float> > solver(caffe::SolverRegistry<float>::CreateSolver(solver_param));

After debugging I found it is stuck in an infinite loop in the blocking queue. The flow is as follows.

layers_[layer_id]->SetUp(bottom_vecs_[layer_id], top_vecs_[layer_id]) --> DataLayerSetUp(bottom, top) ----> Datum& datum = *(reader_.full().peek()) ----> Blocking Queue

It is displaying these logs in the executable window at runtime.

[layer_factory.hpp:77] Creating layer data;
[net.cpp:100] Creating Layer data;
[net.cpp:418] data -> data;

*********** mycaffe.cpp ***************

   Caffe::set_mode(Caffe::GPU);
        shared_ptr<Net<float> > net_;
        net_.reset(new Net<float>(model_file, caffe::Phase::TRAIN));

        Caffe::set_mode(Caffe::GPU);
        caffe::SolverParameter solver_param;
        caffe::ReadSolverParamsFromTextFileOrDie("mysolver.prototxt", &solver_param);
        boost::shared_ptr<caffe::Solver<float> > solver(caffe::SolverRegistry<float>::CreateSolver(solver_param));
        solver->Solve();

***************** mymodel.prototxt ********************

layer {
  name: "data"
  type: "Data"
  top: "data"

  include {
    phase: TRAIN
  }

  data_param {
    source: "data_lmdb"
    batch_size: 4
    backend: LMDB
  }

}

layer {
  name: "label"
  type: "Data"
  top: "label"

  include {
    phase: TRAIN
  }


  data_param {
    source: "label_lmdb"
    batch_size: 4
    backend: LMDB
  }

}
layer {
  name: "conv1"
  type: "Convolution"
  bottom: "data"
  top: "conv1"
  param {
    lr_mult: 1.0
  }
  param {
    lr_mult: 0.10000000149
  }
  convolution_param {
    num_output: 32
    pad: 1
    kernel_size: 3
    stride: 1
    weight_filler {
      type: "gaussian"
      std: 0.0010000000475
    }
    bias_filler {
      type: "constant"
      value: 0.0
    }
  }
}
layer {
  name: "relu1"
  type: "ReLU"
  bottom: "conv1"
  top: "conv1"
}
layer {
  name: "conv2"
  type: "Convolution"
  bottom: "conv1"
  top: "conv2"
  param {
    lr_mult: 1.0
  }
  param {
    lr_mult: 0.10000000149
  }
  convolution_param {
    num_output: 1024
    pad: 0
    kernel_size: 16
    stride: 16
    weight_filler {
      type: "gaussian"
      std: 0.0010000000475
    }
    bias_filler {
      type: "constant"
      value: 0.0
    }
  }
}
layer {
  name: "relu2"
  type: "ReLU"
  bottom: "conv2"
  top: "conv2"
}
layer {
  name: "upsample"
  type: "Deconvolution"
  bottom: "conv2"
  top: "upsample"
  param {
    lr_mult: 1.0
  }
  convolution_param {
    num_output: 1
    pad: 0
    kernel_size: 16
    stride: 16
    bias_filler {
      type: "constant"
      value: 128.0
    }
  }
}
layer {
  name: "lossL1"
  type: "SmoothL1Loss"
  bottom: "upsample"
  bottom: "label"
  top: "lossL1"
  loss_weight: 1.0
}

******************* mysolver.prototxt ************

test_initialization: false
base_lr: 0.01
display: 10
max_iter: 500000
lr_policy: "step"
gamma: 0.1
momentum: 0.9
weight_decay: 0.0001
stepsize: 4069
snapshot: 10000
snapshot_prefix: "snapshot"
solver_mode: GPU
net: "mymodel.prototxt"
solver_type: SGD

Please help me to figure out the mistake. Thank you.

AnkitSahu
  • 421
  • 4
  • 12
  • 1
    In what way does it get stuck in that routine? Is it in an infinite loop? Is it simply not progressing in loss function values? Without a tractably small example and your data, we can't reproduce the problem to trouble-shoot it. – Prune Jan 27 '17 at 00:55
  • It is stuck in the Blocking Queue, in an infinite loop of while (queue_.empty()) { sync_->condition_.wait(lock); } It is called from net.cpp and the flow is as follows: layers_[layer_id]->SetUp(bottom_vecs_[layer_id], top_vecs_[layer_id])--> DataLayerSetUp(bottom, top) ---> Datum& datum = *(reader_.full().peek()) ---> Blocking Queue – AnkitSahu Jan 27 '17 at 08:01
  • 1
    you may need to give the absolute path to the database – Kev1n91 Jan 27 '17 at 10:47
  • @Kev1n91: I have tried with the absolute path of the LMDB but it is still not working ... – AnkitSahu Jan 27 '17 at 10:52
  • Is it because some parameter is missing in the input layer? – AnkitSahu Jan 27 '17 at 14:12

0 Answers0