I set up an OCR classification system using TensorFlow.
Here is the graph:
def build_graph(top_k):
    """Assemble the classification graph and return its key tensors and ops.

    The network is three 3x3 convolutions (64, 128, 256 filters), each
    followed by a 2x2 max-pool with stride 2, then a 1024-unit tanh
    fully-connected layer and a linear output layer with
    ``FLAGS.charset_size`` units. Dropout driven by the ``keep_prob``
    placeholder is applied in front of both fully-connected layers.

    Ops are created in a fixed order on purpose: auto-generated op names
    depend on creation order, and other code may look tensors up by name.

    Args:
        top_k: Number of best-scoring classes to expose per example; also
            used for the top-k accuracy metric.

    Returns:
        A dict mapping string keys to the placeholders, the training op,
        the metrics, the merged summary op and the prediction tensors.
    """
    keep_prob = tf.placeholder(dtype=tf.float32, shape=[], name='keep_prob')
    images = tf.placeholder(dtype=tf.float32, shape=[None, 64, 64, 1], name='image_batch')
    labels = tf.placeholder(dtype=tf.int64, shape=[None], name='label_batch')

    # Convolutional trunk: conv -> pool, repeated once per (depth, scope) pair.
    net = images
    for depth, scope in ((64, 'conv1'), (128, 'conv2'), (256, 'conv3')):
        net = slim.conv2d(net, depth, [3, 3], 1, padding='SAME', scope=scope)
        net = slim.max_pool2d(net, [2, 2], [2, 2], padding='SAME')

    # Classifier head with dropout ahead of each fully-connected layer.
    flat = slim.flatten(net)
    flat_dropped = slim.dropout(flat, keep_prob)
    hidden = slim.fully_connected(flat_dropped, 1024, activation_fn=tf.nn.tanh, scope='fc1')
    hidden_dropped = slim.dropout(hidden, keep_prob)
    logits = slim.fully_connected(hidden_dropped, FLAGS.charset_size, activation_fn=None, scope='fc2')

    # Mean cross-entropy loss and top-1 accuracy over the batch.
    per_example_loss = tf.nn.sparse_softmax_cross_entropy_with_logits(logits=logits, labels=labels)
    loss = tf.reduce_mean(per_example_loss)
    predicted_class = tf.argmax(logits, 1)
    is_correct = tf.equal(predicted_class, labels)
    accuracy = tf.reduce_mean(tf.cast(is_correct, tf.float32))

    # Adam with a staircase exponential learning-rate decay.
    global_step = tf.get_variable("step", [], initializer=tf.constant_initializer(0.0), trainable=False)
    rate = tf.train.exponential_decay(2e-4, global_step, decay_steps=2000, decay_rate=0.97, staircase=True)
    optimizer = tf.train.AdamOptimizer(learning_rate=rate)
    train_op = optimizer.minimize(loss, global_step=global_step)

    probabilities = tf.nn.softmax(logits)

    tf.summary.scalar('loss', loss)
    tf.summary.scalar('accuracy', accuracy)
    merged_summary_op = tf.summary.merge_all()

    # Top-k predictions and the fraction of examples whose label is among them.
    predicted_val_top_k, predicted_index_top_k = tf.nn.top_k(probabilities, k=top_k, name="predicted_top_k")
    label_in_top_k = tf.nn.in_top_k(probabilities, labels, top_k)
    accuracy_in_top_k = tf.reduce_mean(tf.cast(label_in_top_k, tf.float32))

    return dict(
        images=images,
        labels=labels,
        keep_prob=keep_prob,
        top_k=top_k,
        global_step=global_step,
        train_op=train_op,
        loss=loss,
        accuracy=accuracy,
        accuracy_top_k=accuracy_in_top_k,
        merged_summary_op=merged_summary_op,
        predicted_distribution=probabilities,
        predicted_index_top_k=predicted_index_top_k,
        predicted_val_top_k=predicted_val_top_k,
    )
Here is the inference function:
def inference(images, pbfile="pb/ocr.pb"):
    """Run the loaded OCR graph on each image and collect top-k predictions.

    Uses the module-level ``graph`` and ``sess`` objects; this function does
    not load or create them itself.

    Args:
        images: Iterable of image sources accepted by ``Image.open``
            (e.g. file paths). Each one is converted to grayscale, resized
            to ``IMAGE_SIZE`` x ``IMAGE_SIZE`` and scaled to [0, 1].
        pbfile: Not used by this function; kept so existing callers that
            pass it keep working.

    Returns:
        A list with one dict per input, in input order, with keys
        ``'image'`` (the input as given), ``'val'`` (top-k scores) and
        ``'index'`` (top-k class indices) as returned by ``sess.run``.
    """
    print('inference')
    start = time.time()
    predicted_val_top_k = graph.get_tensor_by_name('ocr/predicted_top_k:0')
    predicted_index_top_k = graph.get_tensor_by_name('ocr/predicted_top_k:1')
    tensor_image = graph.get_tensor_by_name('ocr/image_batch:0')
    keep_prob = graph.get_tensor_by_name('ocr/keep_prob:0')
    end = time.time()
    # Elapsed time is end - start; the reverse order printed a negative value.
    print('takes %s second to get tensor' % (end - start))

    result = []
    for image in images:
        temp_image = Image.open(image).convert('L')
        temp_image = temp_image.resize((IMAGE_SIZE, IMAGE_SIZE), Image.ANTIALIAS)
        temp_image = np.asarray(temp_image) / 255.0
        # Batch of one, single channel.
        # NOTE(review): assumes IMAGE_SIZE == 64 so this matches the input
        # placeholder -- confirm.
        temp_image = temp_image.reshape([-1, 64, 64, 1])

        start = time.time()
        # keep_prob of 1.0 disables dropout at inference time.
        predict_val, predict_index = sess.run(
            [predicted_val_top_k, predicted_index_top_k],
            feed_dict={tensor_image: temp_image, keep_prob: 1.0})
        end = time.time()
        print('takes %s second to run tensor' % (end - start))

        result.append({'image': image, 'val': predict_val, 'index': predict_index})
    return result
We have only three classes now, i.e. 'V', 'X' and 'U', and everything works as long as the target we want to detect belongs to one of these three classes.
However, a problem arises when we run detection on a target that does not belong to any of the three candidate classes: e.g. if we feed an 'A' to inference, it is also classified as 'X', which is obviously incorrect.
So I want to reject such out-of-class inputs by setting a score threshold.
We know that tf.nn.softmax returns something like scores, but when I debug, I find that the score (predict_val in the inference function) of class 'X' for target 'A' is almost 1 (actually 0.9999..).
After digging into the softmax, I think this is to be expected:
softmax = tf.exp(logits) / tf.reduce_sum(tf.exp(logits), axis)
It only operates on the logits of the existing classes, so the scores are normalized to sum to 1 over those classes alone.
So, is there any way to add an unknown class for softmax involving all other targets?
Env: Python3.6.5; Tensorflow 1.8.0
Thanks.
Wesley