I am trying to build a pipeline that parallelizes writing TFRecord files for datasets too large to fit into memory. I have used Dask for this successfully many times in the past, but this new dataset requires that TextVectorization and StringLookup layers be applied outside of the model (running them inside the model was choking the GPUs). Ultimately I want to apply the vectorizers/string lookups in series within a single partition and then process the partitions in parallel on Dask's computation engine. I have tried many different ways to write these functions, including applying tf.keras.layers.serialize and tf.keras.layers.deserialize to the vectorizers, without luck.
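For reference, the serialize/deserialize attempt looked roughly like the sketch below. The helper name roundtrip_vectorizer is just illustrative, and the vocabulary handling is my own addition since serialize only captures the layer config:

import tensorflow as tf

def roundtrip_vectorizer(vectorizer):
    # Serialize the layer to a plain dict, then rebuild it from that dict.
    # This only captures the config, not the adapted vocabulary, so I also
    # copy the vocabulary across after deserializing.
    layer_dict = tf.keras.layers.serialize(vectorizer)
    rebuilt = tf.keras.layers.deserialize(layer_dict)
    rebuilt.set_vocabulary(vectorizer.get_vocabulary())
    return rebuilt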
Here is the pipeline as it stands:
from dask.distributed import LocalCluster, Client
import logging
import dask
import json
import os
import pickle
import tensorflow as tf
from tensorflow.keras.layers.experimental.preprocessing import StringLookup, TextVectorization
cluster = LocalCluster()
client = Client(cluster)
# Setup logging
logging.basicConfig(format='%(levelname)s:%(message)s', level=logging.INFO)
def load_and_broadcast_vectorizers(model_dir):
    logging.info("Loading and broadcasting vectorizers...")
    path = os.path.join(model_dir, 'vectorizers_saved')
    vectorizers = {}
    suffix = "_vectorizer_config.json"
    for file in os.listdir(path):
        if file.endswith(suffix):
            key = file[:-len(suffix)]
            with open(os.path.join(path, file), 'r') as f:
                config = json.load(f)
            # Strip our own 'vectorizer_class' marker before passing the rest to from_config
            vectorizer_class = config.pop('vectorizer_class')
            if vectorizer_class == 'StringLookup':
                vectorizer = StringLookup.from_config(config)
            elif vectorizer_class == 'TextVectorization':
                vectorizer = TextVectorization.from_config(config)
            else:
                raise ValueError(f"Unknown vectorizer class: {vectorizer_class}")
            vectorizers[key] = vectorizer
    logging.info("Vectorizers loaded and broadcasted successfully.")
    return client.scatter(vectorizers, broadcast=True)
def apply_vectorizers_to_partition(partition, vectorizers):
    logging.info("Applying vectorizers to partition...")
    for vectorizer_name, vectorizer in vectorizers.items():
        # Keras preprocessing layers are callable; they don't expose a .transform() method
        partition[vectorizer_name] = vectorizer(partition[vectorizer_name].to_numpy()).numpy().tolist()
    return partition
def apply_vectorizers_in_parallel(vectorizers, df):
    logging.info("Applying vectorizers in parallel...")
    return df.map_partitions(apply_vectorizers_to_partition, vectorizers)
def write_partition_to_tfrecord(partition, output_path, partition_label, partition_id):
    logging.info(f"Writing {partition_label} partition {partition_id} to TFRecord...")
    file_name_tfrecord = f'{partition_label}_partition_{partition_id}.tfrecord'
    output_file_path_tfrecord = os.path.join(output_path, file_name_tfrecord)
    with tf.io.TFRecordWriter(output_file_path_tfrecord) as writer:
        # itertuples() yields namedtuples, so fields are accessed as attributes, not by key
        for row in partition.itertuples():
            # Extract features and label from each row
            features = {
                'input_1': tf.train.Feature(int64_list=tf.train.Int64List(value=[row.input_1])),
                'input_2': tf.train.Feature(float_list=tf.train.FloatList(value=[row.input_2])),
                'input_3': tf.train.Feature(int64_list=tf.train.Int64List(value=row.input_3)),
                'input_4': tf.train.Feature(int64_list=tf.train.Int64List(value=[row.input_4])),
                'input_5': tf.train.Feature(int64_list=tf.train.Int64List(value=[row.input_5]))
            }
            label = tf.train.Feature(float_list=tf.train.FloatList(value=[row.label_col]))
            example = tf.train.Example(features=tf.train.Features(feature={**features, 'label': label}))
            writer.write(example.SerializeToString())
    logging.info(f"{partition_label} partition {partition_id} written to TFRecord.")
def write_partition_to_parquet(partition, output_path, partition_label, partition_id):
    logging.info(f"Writing {partition_label} partition {partition_id} to Parquet...")
    selected_columns = partition[['personuuid', 'claimnum', 'claimrowid']]
    file_name_parquet = f'{partition_label}_partition_{partition_id}.parquet.snappy'
    output_file_path_parquet = os.path.join(output_path, file_name_parquet)
    selected_columns.to_parquet(output_file_path_parquet, compression='snappy')
    logging.info(f"{partition_label} partition {partition_id} written to Parquet.")
def write_vectorized_partitions_to_files(vectorizers, df, output_path, partition_label):
    logging.info(f"Writing {partition_label} vectorized partitions to files...")
    dask_tasks = []
    for i, partition in enumerate(df.to_delayed()):
        tfrecord_task = dask.delayed(write_partition_to_tfrecord)(partition, output_path, partition_label, i)
        parquet_task = dask.delayed(write_partition_to_parquet)(partition, output_path, partition_label, i)
        dask_tasks.extend([tfrecord_task, parquet_task])
    dask.compute(*dask_tasks)
    logging.info(f"{partition_label} vectorized partitions written to files successfully.")
def process_data(model_dir, df, output_path):
    logging.info("Processing data...")
    vectorizers = load_and_broadcast_vectorizers(model_dir)
    if vectorizers is None:
        logging.error("Data processing failed due to missing vectorizers.")
        return
    train_df, test_df = df.random_split([0.8, 0.2], random_state=42)
    train_df_vectorized = apply_vectorizers_in_parallel(vectorizers, train_df)
    test_df_vectorized = apply_vectorizers_in_parallel(vectorizers, test_df)
    write_vectorized_partitions_to_files(vectorizers, train_df_vectorized, os.path.join(output_path, 'train'), 'train')
    write_vectorized_partitions_to_files(vectorizers, test_df_vectorized, os.path.join(output_path, 'test'), 'test')
    logging.info("Data processed successfully.")
Error Message:
TypeError: ('Could not serialize object of type StringLookup', '<keras.layers.preprocessing.string_lookup.StringLookup object at 0x7f7f1d79e9a0>')
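As far as I can tell, the failure is raised by the scatter call itself, before any partition is touched. A stripped-down snippet like the one below (my own minimal example, not part of the pipeline) seems to hit the same serialization path:

from dask.distributed import LocalCluster, Client
from tensorflow.keras.layers.experimental.preprocessing import StringLookup

# Scattering a bare StringLookup layer to the workers
client = Client(LocalCluster())
lookup = StringLookup(vocabulary=['a', 'b', 'c'])
future = client.scatter(lookup, broadcast=True)   # raises the TypeError shown above for me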