I am attempting to create a time-distributed transformer model but am encountering a NotImplementedError that I believe is due to not properly setting the output shape (referencing this answer). I have seen similar questions posted, but none with a working example of how to properly set the output shape within the call function. I also referenced this existing GitHub issue and its proposed solution in my code. Reproducible example:
import tensorflow as tf


def get_embedding_layer(embedding_dim=100, embedding_matrix=None, max_num_words=10000, max_sent_length=50, reg_param=1e-13):
    l2_reg = tf.keras.regularizers.l2(reg_param)
    if embedding_matrix is not None:
        # Embedding layer initialized with word2vec coefficients
        embedding_layer = tf.keras.layers.Embedding(max_num_words + 1,
                                                    embedding_dim,
                                                    input_length=max_sent_length,
                                                    trainable=True,
                                                    mask_zero=False,
                                                    embeddings_regularizer=l2_reg,
                                                    weights=[embedding_matrix])
    else:
        # Embedding layer with no pre-trained weights
        embedding_layer = tf.keras.layers.Embedding(max_num_words + 1,
                                                    embedding_dim,
                                                    input_length=max_sent_length,
                                                    trainable=True,
                                                    mask_zero=False,
                                                    embeddings_regularizer=l2_reg)
    return embedding_layer

class MultiHeadSelfAttention(tf.keras.layers.Layer):
    def __init__(self, embed_dim, num_heads=8):
        super(MultiHeadSelfAttention, self).__init__()
        self.embed_dim = embed_dim
        self.num_heads = num_heads
        if embed_dim % num_heads != 0:
            raise ValueError(
                f"embedding dimension = {embed_dim} should be divisible by number of heads = {num_heads}"
            )
        self.projection_dim = embed_dim // num_heads
        self.query_dense = tf.keras.layers.Dense(embed_dim)
        self.key_dense = tf.keras.layers.Dense(embed_dim)
        self.value_dense = tf.keras.layers.Dense(embed_dim)
        self.combine_heads = tf.keras.layers.Dense(embed_dim)

    def attention(self, query, key, value):
        score = tf.matmul(query, key, transpose_b=True)
        dim_key = tf.cast(tf.shape(key)[-1], tf.float32)
        scaled_score = score / tf.math.sqrt(dim_key)
        weights = tf.nn.softmax(scaled_score, axis=-1)
        output = tf.matmul(weights, value)
        return output, weights

    def separate_heads(self, x, batch_size):
        x = tf.reshape(x, (batch_size, -1, self.num_heads, self.projection_dim))
        return tf.transpose(x, perm=[0, 2, 1, 3])

    def call(self, inputs):
        # x.shape = [batch_size, seq_len, embedding_dim]
        batch_size = tf.shape(inputs)[0]
        query = self.query_dense(inputs)  # (batch_size, seq_len, embed_dim)
        key = self.key_dense(inputs)  # (batch_size, seq_len, embed_dim)
        value = self.value_dense(inputs)  # (batch_size, seq_len, embed_dim)
        query = self.separate_heads(
            query, batch_size
        )  # (batch_size, num_heads, seq_len, projection_dim)
        key = self.separate_heads(
            key, batch_size
        )  # (batch_size, num_heads, seq_len, projection_dim)
        value = self.separate_heads(
            value, batch_size
        )  # (batch_size, num_heads, seq_len, projection_dim)
        attention, weights = self.attention(query, key, value)
        attention = tf.transpose(
            attention, perm=[0, 2, 1, 3]
        )  # (batch_size, seq_len, num_heads, projection_dim)
        concat_attention = tf.reshape(
            attention, (batch_size, -1, self.embed_dim)
        )  # (batch_size, seq_len, embed_dim)
        output = self.combine_heads(
            concat_attention
        )  # (batch_size, seq_len, embed_dim)
        return output

class TransformerBlock(tf.keras.layers.Layer):
    def __init__(self, embed_dim, num_heads, ff_dim, rate=0.1):
        super(TransformerBlock, self).__init__()
        self.att = MultiHeadSelfAttention(embed_dim, num_heads)
        self.ffn = tf.keras.Sequential(
            [tf.keras.layers.Dense(ff_dim, activation="relu"), tf.keras.layers.Dense(embed_dim)]
        )
        self.layernorm1 = tf.keras.layers.LayerNormalization(epsilon=1e-6)
        self.layernorm2 = tf.keras.layers.LayerNormalization(epsilon=1e-6)
        self.dropout1 = tf.keras.layers.Dropout(rate)
        self.dropout2 = tf.keras.layers.Dropout(rate)

    def call(self, inputs, training):
        shape = tf.shape(inputs)
        int_shape = inputs.shape.as_list()
        attn_output = self.att(inputs)
        attn_output = self.dropout1(attn_output, training=training)
        out1 = self.layernorm1(inputs + attn_output)
        ffn_output = self.ffn(out1)
        ffn_output = self.dropout2(ffn_output, training=training)
        values = self.layernorm2(out1 + ffn_output)
        # attempted to set shape according to this issue
        # https://github.com/tensorflow/tensorflow/issues/38296#issuecomment-623698709
        values.set_shape(self.compute_output_shape(int_shape))
        return values

def create_model(reg_param=1e-13, embed_dim=100, embedding_matrix=None, gru_units=100,
                 max_sents=12, max_sent_length=50, max_num_words=10000, num_heads=8,
                 ff_dim=1024, rate=0.1):
    l2_reg = tf.keras.regularizers.l2(reg_param)
    # Documents are batches of max_sents sentences, each max_sent_length tokens long
    doc_input = tf.keras.layers.Input(shape=(max_sents, max_sent_length,),
                                      name='doc_input',
                                      dtype='int32')
    embedding_layer = get_embedding_layer(embedding_dim=embed_dim,
                                          embedding_matrix=embedding_matrix,
                                          max_num_words=max_num_words,
                                          max_sent_length=max_sent_length,
                                          reg_param=reg_param)
    # Embed each sentence (time step) independently
    embedded_tps = tf.keras.layers.TimeDistributed(embedding_layer,
                                                   name='time_distributed_embedding')(doc_input)
    transformer_block = TransformerBlock(embed_dim,
                                         num_heads,
                                         ff_dim,
                                         rate=rate)
    # Apply the transformer encoder block to each sentence via TimeDistributed
    transformer_tps = tf.keras.layers.TimeDistributed(transformer_block,
                                                      name='time_distributed_transformer')(embedded_tps)
    max_pooled_tps = tf.keras.layers.TimeDistributed(tf.keras.layers.GlobalMaxPooling1D(),
                                                     name='time_distributed_max_pooling')(transformer_tps)
    # Sentence-level vectors are summarized over the document by a bidirectional GRU
    gru_out = tf.keras.layers.Bidirectional(tf.keras.layers.GRU(gru_units,
                                                                return_sequences=False,
                                                                kernel_regularizer=l2_reg,
                                                                name='doc_bigru'))(max_pooled_tps)
    preds = tf.keras.layers.Dense(1, activation='sigmoid',
                                  kernel_regularizer=l2_reg,
                                  name='output_pred')(gru_out)
    model = tf.keras.Model(doc_input, preds)
    return model

model = create_model(embed_dim=128,
                     gru_units=256,
                     reg_param=1e-13,
                     max_sents=12,
                     max_sent_length=512,
                     max_num_words=50000,
                     num_heads=8,
                     ff_dim=1024,
                     rate=0.1)

Raises the following error:

NotImplementedError: in user code:

    <ipython-input-63-0a712c6c4d37>:24 call *
        values.set_shape(self.compute_output_shape(int_shape))
    /Users/kevinprybol/opt/anaconda3/lib/python3.8/site-packages/tensorflow/python/keras/engine/base_layer.py:741 compute_output_shape **
        raise NotImplementedError

    NotImplementedError:
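
From the traceback, the NotImplementedError comes from the default tf.keras.layers.Layer.compute_output_shape, so my guess is that TransformerBlock needs its own override for the set_shape call above to work. Since the block should not change the shape of its input, I assume the override would just echo the input shape back, roughly like the sketch below, but I have not been able to confirm that this is the right approach:

class TransformerBlock(tf.keras.layers.Layer):
    # __init__ and call as in the example above

    def compute_output_shape(self, input_shape):
        # Assumption: attention, feed-forward, dropout and layer norm all keep
        # the (batch_size, seq_len, embed_dim) shape, so the output shape is
        # just the input shape.
        return input_shape

Is overriding compute_output_shape like this the correct way to set the output shape for a layer used inside TimeDistributed, or is there a different mechanism I should be using?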