I'd use a tf.data.Dataset. With eager execution enabled:
import tensorflow as tf
import tensorflow.contrib.eager as tfe
tfe.enable_eager_execution()

def _decode_and_length_map(encoded_string):
  # Decode each string into a variable-length vector of bytes,
  # keeping the original length alongside it.
  decoded = tf.decode_raw(encoded_string, out_type=tf.uint8)
  return decoded, tf.shape(decoded)[0]

inputs = tf.constant(["aaa", "bbbbbbbb", "abcde"], dtype=tf.string)
dataset = (tf.data.Dataset.from_tensor_slices(inputs)
           .map(_decode_and_length_map)
           # Zero-pad the byte vectors to the longest in each batch;
           # the scalar lengths ([]) need no padding.
           .padded_batch(batch_size=2, padded_shapes=([None], [])))
iterator = tfe.Iterator(dataset)
print(iterator.next())
print(iterator.next())
This prints (disclaimer: output manually reformatted for readability):
(<tf.Tensor: id=24, shape=(2, 8), dtype=uint8,
   numpy=array([[97, 97, 97,  0,  0,  0,  0,  0],
                [98, 98, 98, 98, 98, 98, 98, 98]], dtype=uint8)>,
 <tf.Tensor: id=25, shape=(2,), dtype=int32, numpy=array([3, 8], dtype=int32)>)
(<tf.Tensor: id=28, shape=(1, 5), dtype=uint8,
   numpy=array([[ 97,  98,  99, 100, 101]], dtype=uint8)>,
 <tf.Tensor: id=29, shape=(1,), dtype=int32, numpy=array([5], dtype=int32)>)
Of course you can mix and match data sources, add randomization, change the padding character, etc.
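For instance, shuffling plus a space character (byte 32) as the pad value might look like the following sketch against the same 1.x-era API; the buffer_size and padding constants are illustrative choices, not anything from the original:

dataset = (tf.data.Dataset.from_tensor_slices(inputs)
           .map(_decode_and_length_map)
           .shuffle(buffer_size=3)  # randomize element order
           .padded_batch(
               batch_size=2,
               padded_shapes=([None], []),
               # Pad the byte component with spaces (32) rather than zeros.
               # The scalar length component is never actually padded, but
               # the structure still requires a value for it.
               padding_values=(tf.constant(32, dtype=tf.uint8),
                               tf.constant(0, dtype=tf.int32))))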
Also works with graph building:
import tensorflow as tf

def _decode_and_length_map(encoded_string):
  decoded = tf.decode_raw(encoded_string, out_type=tf.uint8)
  return decoded, tf.shape(decoded)[0]

inputs = tf.constant(["aaa", "bbbbbbbb", "abcde"], dtype=tf.string)
dataset = (tf.data.Dataset.from_tensor_slices(inputs)
           .map(_decode_and_length_map)
           .padded_batch(batch_size=2, padded_shapes=([None], [])))
batch_op = dataset.make_one_shot_iterator().get_next()
with tf.Session() as session:
  print(session.run(batch_op))
  print(session.run(batch_op))
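If you happen to be on TensorFlow 2.x (an assumption beyond the 1.x APIs shown above), eager execution is on by default, tf.decode_raw has moved to tf.io.decode_raw, and datasets are directly iterable, so the same idea shrinks to roughly:

import tensorflow as tf

def _decode_and_length_map(encoded_string):
  decoded = tf.io.decode_raw(encoded_string, out_type=tf.uint8)
  return decoded, tf.shape(decoded)[0]

inputs = tf.constant(["aaa", "bbbbbbbb", "abcde"], dtype=tf.string)
dataset = (tf.data.Dataset.from_tensor_slices(inputs)
           .map(_decode_and_length_map)
           .padded_batch(batch_size=2, padded_shapes=([None], [])))
# No Session or explicit iterator needed under eager execution.
for padded_bytes, lengths in dataset:
  print(padded_bytes, lengths)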