I'm writing some variable length string feature to tfrecord. If the feature has the same shape for all examples, it runs perfectly fine without problems. If the shape varies, the error below is raised whenever the created tfrecord is being read.
import random
import numpy as np
import tensorflow as tf
def serialize_example(writer):
# s = np.array(['aaa' for _ in range(10)]) # this works fine
s = np.array(['aaa' for _ in range(random.randint(1, 100))])
features = {
'f1': tf.train.Feature(
bytes_list=tf.train.BytesList(value=[tf.io.serialize_tensor(s).numpy()])
)
}
example = tf.train.Example(features=tf.train.Features(feature=features))
writer.write(example.SerializeToString())
def create_tfrecord(output_path):
with tf.io.TFRecordWriter(output_path) as writer:
for i in range(total := 100):
print(f'\rWriting example: {i + 1}/{total}', end='')
serialize_example(writer)
def read_example(example, feature_map):
features = tf.io.parse_single_example(example, feature_map)
f1 = tf.sparse.to_dense(features['f1'])
f1 = tf.io.parse_tensor(f1[0], tf.string)
return f1
def read_tfrecord(fp, batch_size):
files = tf.data.Dataset.list_files(fp)
dataset = files.flat_map(tf.data.TFRecordDataset)
feature_map = {
'f1': tf.io.VarLenFeature(tf.string),
}
return dataset.map(
lambda x: read_example(x, feature_map),
tf.data.experimental.AUTOTUNE,
).batch(batch_size) # if this is removed, both cases work fine
if __name__ == '__main__':
create_tfrecord('xyz.tfrecord')
dataset = read_tfrecord('xyz.tfrecord', 8)
sample = dataset.take(1).as_numpy_iterator().next()
Error:
tensorflow.python.framework.errors_impl.InvalidArgumentError: Cannot add tensor to the batch: number of elements does not match. Shapes are: [tensor]: [83], [batch]: [32] [Op:IteratorGetNext]
If .batch(batch_size)
is removed, it works perfectly fine for both cases. I'm expecting replacing .batch
with .padded_batch(batch_size)
would fix the problem however, thanks to tensorflow's brilliant implementation which produces unknown shapes, this is quite impossible as well.
ValueError: You must provide `padded_shapes` argument because component 0 has unknown rank.
And of course, it's impossible to know the missing padded_shapes
in read_example
.