
Note: this question started differently, but I deleted all previous (now unnecessary) information.

I have a CsvDataset which consists of a label (float) and a text (string). I want to transform every row so that I can feed it into a pretrained BERT model. Unfortunately, I cannot get past the .map function:

files = glob.glob("example*.tsv")
d = tf.data.experimental.CsvDataset(files, 
    [tf.float32, tf.string], 
    select_cols=[3,4], 
    field_delim="\t", 
    header=True)
parsed_dataset = d.map(lambda label, text: tf.py_func(_decode_record, [label, text], [tf.float32, tf.string]))

def _decode_record(label, text):
    """Decodes a row to a TensorFlow example."""
    label_list = [1, 2, 3, 4, 5]
    label_map = {}
    for (i, lbl) in enumerate(label_list):
        label_map[lbl] = i
    tokens_a = tokenizer.tokenize(text)
    # Account for [CLS] and [SEP] with "- 2"
    if len(tokens_a) > max_seq_length - 2:
        tokens_a = tokens_a[0: (max_seq_length - 2)]
    tokens = []
    segment_ids = []
    tokens.append("[CLS]")
    segment_ids.append(0)
    for token in tokens_a:
        tokens.append(token)
        segment_ids.append(0)
    tokens.append("[SEP]")
    segment_ids.append(0)

    input_ids = tokenizer.convert_tokens_to_ids(tokens)

    # The mask has 1 for real tokens and 0 for padding tokens. Only real
    # tokens are attended to.
    input_mask = [1] * len(input_ids)

    # Zero-pad up to the sequence length.
    while len(input_ids) < max_seq_length:
        input_ids.append(0)
        input_mask.append(0)
        segment_ids.append(0)

    assert len(input_ids) == max_seq_length
    assert len(input_mask) == max_seq_length
    assert len(segment_ids) == max_seq_length

    label_id = label_map[label]
    features = collections.OrderedDict()
    features["input_ids"] = create_int_feature(input_ids)
    features["input_mask"] = create_int_feature(input_mask)
    features["segment_ids"] = create_int_feature(segment_ids)
    features["label_ids"] = create_int_feature([label_id])
    features["is_real_example"] = create_int_feature(
        [int(True)])
    tf_example = tf.train.Example(features=tf.train.Features(feature=features))
    return tf_example

This breaks with: tensorflow.python.framework.errors_impl.UnimplementedError: Unsupported object type Example [[{{node PyFunc}}]] [Op:IteratorGetNextSync]

Jonathan R
    Are you trying to convert a single row of a csv file to tfrecords? – Sharky May 20 '19 at 22:56
    Possible duplicate of [how to get string value out of tf.tensor which dtype is string](https://stackoverflow.com/questions/56122670/how-to-get-string-value-out-of-tf-tensor-which-dtype-is-string) – giser_yugang May 21 '19 at 01:08
  • I am trying to convert a dataset made of multiple CSV files into a dataset made of tf.Example where each feature is in the form that is required by BERT (e.g. string -> array of input ids of type int64) – Jonathan R May 21 '19 at 08:35
  • Thanks Giser, this looks promising. Still, I think this is not exactly what I need, because as I see it the map function there is called only once per file and not for every element in that file (every row in a CSV)? Correct me if I am wrong – Jonathan R May 21 '19 at 08:38

1 Answer


I found a solution to the problem; the code below does the job. My problem was that I misunderstood the Tout parameter of tf.py_func: it declares the TensorFlow dtype(s) of the values the Python function returns, not the dtypes of its inputs. Since _convert now returns the example serialized as a single string instead of a tf.train.Example object, Tout is simply tf.string.

def _convert(label, text):
    """Decodes a csv-line to a TensorFlow Example, serialized as a string."""
    label_list = [1, 2, 3, 4, 5]
    label_map = {}
    for (i, lbl) in enumerate(label_list):
        label_map[lbl] = i
    tokens_a = tokenizer.tokenize(text)
    # Account for [CLS] and [SEP] with "- 2"
    if len(tokens_a) > max_seq_length - 2:
        tokens_a = tokens_a[0: (max_seq_length - 2)]
    tokens = []
    segment_ids = []
    tokens.append("[CLS]")
    segment_ids.append(0)
    for token in tokens_a:
        tokens.append(token)
        segment_ids.append(0)
    tokens.append("[SEP]")
    segment_ids.append(0)

    input_ids = tokenizer.convert_tokens_to_ids(tokens)

    # The mask has 1 for real tokens and 0 for padding tokens. Only real
    # tokens are attended to.
    input_mask = [1] * len(input_ids)

    # Zero-pad up to the sequence length.
    while len(input_ids) < max_seq_length:
        input_ids.append(0)
        input_mask.append(0)
        segment_ids.append(0)

    assert len(input_ids) == max_seq_length
    assert len(input_mask) == max_seq_length
    assert len(segment_ids) == max_seq_length

    label_id = label_map[label]
    features = collections.OrderedDict()
    features["input_ids"] = create_int_feature(input_ids)
    features["input_mask"] = create_int_feature(input_mask)
    features["segment_ids"] = create_int_feature(segment_ids)
    features["label_ids"] = create_int_feature([label_id])
    features["is_real_example"] = create_int_feature(
        [int(True)])
    tf_example = tf.train.Example(features=tf.train.Features(feature=features))
    # We cannot return the tf.train.Example object itself, because tf.py_func can only
    # return plain TensorFlow dtypes, so we serialize the example to a string.
    return tf_example.SerializeToString()
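
For completeness, the snippet above relies on a few names that the question does not define (tokenizer, max_seq_length and create_int_feature). A minimal setup sketch, assuming the reference BERT tokenizer from the bert-tensorflow package and a vocab file shipped with the pretrained checkpoint (the vocab path and the sequence length of 128 are placeholders):

import collections
import glob
import tensorflow as tf
from bert import tokenization  # pip install bert-tensorflow

max_seq_length = 128
# vocab.txt comes with the pretrained BERT checkpoint; the path is a placeholder.
tokenizer = tokenization.FullTokenizer(vocab_file="vocab.txt", do_lower_case=True)

def create_int_feature(values):
    """Wraps a list of ints into a tf.train.Feature, as in BERT's run_classifier.py."""
    return tf.train.Feature(int64_list=tf.train.Int64List(value=list(values)))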

name_to_features = {
    'input_ids': tf.FixedLenFeature([128], tf.int64),
    'input_mask': tf.FixedLenFeature([128], tf.int64),
    'segment_ids': tf.FixedLenFeature([128], tf.int64),
    'label_ids': tf.FixedLenFeature([1], tf.int64),
    'is_real_example': tf.FixedLenFeature([1], tf.int64)
}

def _decode_record(record):
    """Decodes a record to a TensorFlow example."""
    example = tf.parse_single_example(record, name_to_features)
    # tf.Example only supports tf.int64, but the TPU only supports tf.int32,
    # so cast all int64 to int32.
    for name in list(example.keys()):
        t = example[name]
        if t.dtype == tf.int64:
            t = tf.to_int32(t)
        example[name] = t
    return example

parsed_dataset = d.map(lambda label, text: tf.py_func(_convert, [label, text], tf.string))
parsed_dataset = parsed_dataset.map(_decode_record)
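
To sanity-check the pipeline you can batch and iterate the result. A short usage sketch, assuming eager execution is enabled (which the IteratorGetNextSync error in the question suggests) and an illustrative batch size of 32:

parsed_dataset = parsed_dataset.batch(32)
for batch in parsed_dataset:
    # FixedLenFeature gives every field a static shape, so batching works.
    print(batch["input_ids"].shape)  # (32, 128)
    print(batch["label_ids"].shape)  # (32, 1)
    break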

Note that this solution uses tf.py_func and is therefore not usable with accelerators like GPUs or TPUs.
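
If the py_func overhead matters, one workaround (not part of the original answer, just a sketch under the same assumptions) is to run _convert once offline, write the serialized examples to a TFRecord file, and train from that file so the live input pipeline contains no Python ops:

writer = tf.python_io.TFRecordWriter("train.tfrecord")  # output path is a placeholder
for label, text in d:  # eager iteration over the CsvDataset from the question
    writer.write(_convert(label.numpy(), text.numpy()))
writer.close()

train_dataset = tf.data.TFRecordDataset("train.tfrecord").map(_decode_record)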

Jonathan R