
Here is the Keras test `test_ctc_decode_greedy` for `ctc_decode`.

Here is my slightly modified example:

import numpy as np
from keras import backend as K


def test_ctc_decode_greedy():
    def _remove_repeats(inds):
        is_not_repeat = np.insert(np.diff(inds).astype(bool), 0, True)
        return inds[is_not_repeat]

    def _remove_blanks(inds, n_classes):
        return inds[inds < (n_classes - 1)]

    def ctc_decode_np(y_pred, input_length):
        # Note:
        # Last element in alphabet treated as blank character
        # decoded_dense padded with -1

        n_samples = y_pred.shape[0]
        n_classes = y_pred.shape[-1]
        log_prob = np.zeros((n_samples, 1))
        decoded_dense = -np.ones_like(y_pred[..., 0])
        decoded_length = np.zeros((n_samples,), dtype=int)

        for i in range(n_samples):
            print('-'*60)
            # [n_time_steps, alphabet_size]
            prob = y_pred[i]

            length = input_length[i]

            decoded = np.argmax(prob[:length], axis=-1)

            print('decoded:', decoded)

            log_prob[i] = -np.sum(np.log(prob[np.arange(length), decoded]))

            decoded = _remove_repeats(decoded)

            print('decoded remove_repeats:', decoded)

            decoded = _remove_blanks(decoded, n_classes)

            print('decoded remove_blanks:', decoded)

            decoded_length[i] = len(decoded)
            decoded_dense[i, :len(decoded)] = decoded

        print('-' * 60)

        return decoded_dense[:, :np.max(decoded_length)], log_prob

    n_time_steps = 6
    alphabet_size = 4

    seq_len_0 = 4
    input_prob_matrix_0 = np.asarray(
        [[1.0, 0.0, 0.0, 0.0],  # t=0
         [0.0, 0.0, 0.4, 0.6],  # t=1
         [0.0, 0.0, 0.4, 0.6],  # t=2
         [0.0, 0.9, 0.1, 0.0],  # t=3
         [0.0, 0.0, 0.0, 0.0],  # t=4 (ignored)
         [0.0, 0.0, 0.0, 0.0]], # t=5 (ignored)
        dtype=np.float32)

    seq_len_1 = 5
    input_prob_matrix_1 = np.asarray(
        [[0.1, 0.9, 0.0, 0.0],  # t=0
         [0.0, 0.9, 0.1, 0.0],  # t=1
         [0.0, 0.0, 0.1, 0.9],  # t=2
         [0.0, 0.9, 0.1, 0.1],  # t=3
         [0.9, 0.1, 0.0, 0.0],  # t=4
         [0.0, 0.0, 0.0, 0.0]], # t=5 (ignored)
        dtype=np.float32)

    # [batch_size, max_chars_in_text, alphabet_size]
    inputs = np.array([input_prob_matrix_0, input_prob_matrix_1])
    print('inputs.shape', inputs.shape)

    # [batch_size, ]
    input_length = np.array([seq_len_0, seq_len_1], dtype=np.int32)
    print('input_length.shape', input_length.shape)

    decode_pred_np, log_prob_pred_np = ctc_decode_np(inputs, input_length)

    # max_decoded_text_length depends on the batch; shorter samples in the batch are padded with -1
    # [batch_size, max_decoded_text_length]
    print('decode_pred_np.shape', decode_pred_np.shape)
    print(decode_pred_np)

    # [batch_size, 1]
    print('log_prob_pred_np.shape', log_prob_pred_np.shape)
    print(log_prob_pred_np)

    inputs = K.variable(inputs)
    input_length = K.variable(input_length)
    decode_pred_tf, log_prob_pred_tf = K.ctc_decode(inputs, input_length, greedy=True)
    assert len(decode_pred_tf) == 1
    decode_pred = K.eval(decode_pred_tf[0])
    log_prob_pred = K.eval(log_prob_pred_tf)

    assert np.all(decode_pred_np == decode_pred)
    assert np.allclose(log_prob_pred_np, log_prob_pred)

Output:

------------------------------------------------------------
keras version: 2.1.6
tensorflow version: 1.14.0
------------------------------------------------------------
inputs.shape (2, 6, 4)
input_length.shape (2,)
------------------------------------------------------------
decoded: [0 3 3 1]
decoded remove_repeats: [0 3 1]
decoded remove_blanks: [0 1]
------------------------------------------------------------
decoded: [1 1 3 1 0]
decoded remove_repeats: [1 3 1 0]
decoded remove_blanks: [1 1 0]
------------------------------------------------------------
decode_pred_np.shape (2, 3)
[[ 0.  1. -1.]
 [ 1.  1.  0.]]
log_prob_pred_np.shape (2, 1)
[[1.12701166]
 [0.52680272]]

Questions:

  1. At inference time, how do we get the number of valid time steps of the prediction (in the example some of the last rows are ignored)? As I understand it, at inference we get a fixed-size matrix as the output of the network, so do we need another output to predict the number of valid time steps?

  2. As I understand it, log_prob is the negative log likelihood (a lower value is better, i.e. the network is more confident in its output), but because the number of time steps can differ, are the log_prob values not comparable between samples with different lengths (I don't see any normalization on length)? See the sketch after this list.
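
One way to make those scores comparable (a sketch of my own, assuming a simple per-step normalization; this is not something `K.ctc_decode` provides) is to divide the summed negative log likelihood of the greedy path by the number of valid time steps. `normalized_log_prob` is a hypothetical helper:

import numpy as np

def normalized_log_prob(y_pred, input_length):
    # Per-time-step negative log likelihood of the greedy path.
    # y_pred: [batch_size, max_time_steps, n_classes]
    # input_length: [batch_size], number of valid time steps per sample
    scores = np.zeros((y_pred.shape[0], 1))
    for i in range(y_pred.shape[0]):
        length = input_length[i]
        decoded = np.argmax(y_pred[i, :length], axis=-1)
        nll = -np.sum(np.log(y_pred[i, np.arange(length), decoded]))
        scores[i] = nll / length  # normalize by the number of valid steps
    return scores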

UPDATE:

For more context, here is a more readable example with a real alphabet and text encoding/decoding; here I use `input_length = n_time_steps`:

import numpy as np


def test_ctc_decode_greedy():
    def labels_to_text(labels, alphabet):
        chars = []
        for i in labels:
            chars.append(alphabet[i])
        text = ''.join(chars)
        return text

    def _remove_repeats(inds):
        is_not_repeat = np.insert(np.diff(inds).astype(bool), 0, True)
        return inds[is_not_repeat]

    def _remove_blanks(inds, n_classes):
        return inds[inds < (n_classes - 1)]

    def ctc_decode_np(y_pred, input_length):
        # Note:
        # Last element in alphabet treated as blank character
        # decoded_dense padded with -1

        n_samples = y_pred.shape[0]
        n_classes = y_pred.shape[-1]  # == len(alphabet); the blank '-' is already included in the alphabet here
        log_prob = np.zeros((n_samples, 1))
        decoded_dense = -np.ones_like(y_pred[..., 0])

        print('n_samples', n_samples) #
        print('n_classes', n_classes) #
        print('log_prob.shape', log_prob.shape) #
        print('decoded_dense.shape', decoded_dense.shape) #

        for i in range(n_samples):
            print('-'*60)
            # [n_time_steps, alphabet_size]
            prob = y_pred[i]
            length = input_length[i]

            decoded = np.argmax(prob[:length], axis=-1)

            print('decoded:', decoded, labels_to_text(decoded, alphabet)) #

            print('prob[np.arange(length), decoded] :', prob[np.arange(length), decoded]) #
            print('np.log(prob[np.arange(length), decoded]) :', np.log(prob[np.arange(length), decoded])) #

            log_prob[i] = -np.sum(np.log(prob[np.arange(length), decoded]))
            print('log_prob[i]', log_prob[i]) #

            decoded = _remove_repeats(decoded)

            print('decoded: remove_repeats:', decoded, labels_to_text(decoded, alphabet))  #

            decoded = _remove_blanks(decoded, n_classes)

            print('decoded: remove_blanks:', decoded, labels_to_text(decoded, alphabet))  #

            decoded_dense[i, :len(decoded)] = decoded

        print('-' * 60)

        return decoded_dense, log_prob

    n_time_steps = 8
    alphabet = ['h', 'e', 'l', 'o', '-']
    alphabet_size = len(alphabet)

    seq_len_0 = n_time_steps
    # 'hhel-lo-' after decoding should be 'hello'
    input_prob_matrix_0 = np.asarray(
        [[1.0, 0.0, 0.0, 0.0, 0.0],  # t=0 # 'h'
         [1.0, 0.0, 0.0, 0.0, 0.0],  # t=1 # 'h'
         [0.0, 1.0, 0.0, 0.0, 0.0],  # t=2 # 'e'
         [0.0, 0.0, 1.0, 0.0, 0.0],  # t=3 # 'l'
         [0.0, 0.0, 0.0, 0.0, 1.0],  # t=4 # '-'
         [0.0, 0.0, 1.0, 0.0, 0.0],  # t=5 # 'l'
         [0.0, 0.0, 0.0, 1.0, 0.0],  # t=6 # 'o'
         [0.0, 0.0, 0.0, 0.0, 1.0]], # t=7 # '-'
        dtype=np.float32)

    # [batch_size, n_time_steps, alphabet_size]
    inputs = np.array([input_prob_matrix_0])
    print('inputs.shape', inputs.shape) #

    # [batch_size, ]
    input_length = np.array([seq_len_0], dtype=np.int32)
    print('input_length.shape', input_length.shape) #

    decode_pred_np, log_prob_pred_np = ctc_decode_np(inputs, input_length)

    # [batch_size, n_time_steps]
    print('decode_pred_np.shape', decode_pred_np.shape) #
    print(decode_pred_np)

    # [batch_size, 1]
    print('log_prob_pred_np.shape', log_prob_pred_np.shape) #
    print(log_prob_pred_np)

Output:

------------------------------------------------------------
inputs.shape (1, 8, 5)
input_length.shape (1,)
n_samples 1
n_classes 5
log_prob.shape (1, 1)
decoded_dense.shape (1, 8)
------------------------------------------------------------
decoded: [0 0 1 2 4 2 3 4] hhel-lo-
prob[np.arange(length), decoded] : [1. 1. 1. 1. 1. 1. 1. 1.]
np.log(prob[np.arange(length), decoded]) : [0. 0. 0. 0. 0. 0. 0. 0.]
log_prob[i] [-0.]
decoded: remove_repeats: [0 1 2 4 2 3 4] hel-lo-
decoded: remove_blanks: [0 1 2 2 3] hello
------------------------------------------------------------
decode_pred_np.shape (1, 8)
[[ 0.  1.  2.  2.  3. -1. -1. -1.]]
log_prob_pred_np.shape (1, 1)
[[-0.]]
  • Regarding question 1: if some of the time steps are unneeded, they will produce blank characters in the prediction, which get filtered out, so I don't think another output is needed. Question 2: if no extra output is needed, then the number of time steps is the same, so there is no problem with the loss function – Vadim Plăcintă May 07 '20 at 11:11
  • Why then does `K.ctc_decode` have an `input_length` argument, i.e. why can the sequence length vary? It could just infer the max sequence length from the shape of `inputs`. – mrgloom May 07 '20 at 18:59
  • Anyway, `input_length` should be one of the four arrays you already fed to ctc_loss: the one that contains the sequence length for each batch item in `y_pred`. I think you asked about some fifth array; I don't think it needs to exist. – Vadim Plăcintă May 08 '20 at 20:44
  • As I understand it, at inference time you just have an input `image` that produces the network output (called `inputs` in this example) with dimensions `[batch_size, max_time_steps, n_classes]`, so the question is how we can get the "number of valid time steps", or should we always use the max sequence length? If so, why does `K.ctc_decode` need the `input_length` parameter? – mrgloom May 10 '20 at 00:25
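
For reference, a common pattern in CRNN-style OCR pipelines (an assumption about the surrounding setup, not something the Keras API dictates) is that the number of valid time steps is determined by the input width and the network's total downsampling factor, so `input_length` can be computed without an extra network output. A minimal sketch with hypothetical `downsample_factor` and `image_widths` values:

import numpy as np

# Hypothetical values: a conv stack that downsamples width by 4x
downsample_factor = 4
image_widths = np.array([128, 96])  # per-sample widths before padding

# Each sample contributes ceil(width / downsample_factor) valid time steps;
# the padded tail beyond input_length is ignored by the decoder, like the
# "(ignored)" rows in the examples above.
input_length = np.ceil(image_widths / downsample_factor).astype(np.int32)
print(input_length)  # [32 24]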
