I am attempting to create a time-distributed transformer model but am encountering a NotImplementedError that I believe is due to not properly setting the output shape (referencing this answer). I have seen similar questions posted, but none with a working example of how to properly set the output shape within the call function. I also referenced this existing GitHub issue and its proposed solution in my code. Reproducible example:
import tensorflow as tf


def get_embedding_layer(embedding_dim=100, embedding_matrix=None, max_num_words=10000, max_sent_length=50, reg_param=1e-13):
    l2_reg = tf.keras.regularizers.l2(reg_param)
    if embedding_matrix is not None:
        # Embedding layer initialized with word2vec coefficients
        embedding_layer = tf.keras.layers.Embedding(max_num_words + 1,
                                                    embedding_dim,
                                                    input_length=max_sent_length,
                                                    trainable=True,
                                                    mask_zero=False,
                                                    embeddings_regularizer=l2_reg,
                                                    weights=[embedding_matrix])
    else:
        # Embedding layer with no pre-trained weights
        embedding_layer = tf.keras.layers.Embedding(max_num_words + 1,
                                                    embedding_dim,
                                                    input_length=max_sent_length,
                                                    trainable=True,
                                                    mask_zero=False,
                                                    embeddings_regularizer=l2_reg)
    return embedding_layer

class MultiHeadSelfAttention(tf.keras.layers.Layer):
    def __init__(self, embed_dim, num_heads=8):
        super(MultiHeadSelfAttention, self).__init__()
        self.embed_dim = embed_dim
        self.num_heads = num_heads
        if embed_dim % num_heads != 0:
            raise ValueError(
                f"embedding dimension = {embed_dim} should be divisible by number of heads = {num_heads}"
            )
        self.projection_dim = embed_dim // num_heads
        self.query_dense = tf.keras.layers.Dense(embed_dim)
        self.key_dense = tf.keras.layers.Dense(embed_dim)
        self.value_dense = tf.keras.layers.Dense(embed_dim)
        self.combine_heads = tf.keras.layers.Dense(embed_dim)

    def attention(self, query, key, value):
        score = tf.matmul(query, key, transpose_b=True)
        dim_key = tf.cast(tf.shape(key)[-1], tf.float32)
        scaled_score = score / tf.math.sqrt(dim_key)
        weights = tf.nn.softmax(scaled_score, axis=-1)
        output = tf.matmul(weights, value)
        return output, weights

    def separate_heads(self, x, batch_size):
        x = tf.reshape(x, (batch_size, -1, self.num_heads, self.projection_dim))
        return tf.transpose(x, perm=[0, 2, 1, 3])

    def call(self, inputs):
        # x.shape = [batch_size, seq_len, embedding_dim]
        batch_size = tf.shape(inputs)[0]
        query = self.query_dense(inputs)  # (batch_size, seq_len, embed_dim)
        key = self.key_dense(inputs)  # (batch_size, seq_len, embed_dim)
        value = self.value_dense(inputs)  # (batch_size, seq_len, embed_dim)
        query = self.separate_heads(
            query, batch_size
        )  # (batch_size, num_heads, seq_len, projection_dim)
        key = self.separate_heads(
            key, batch_size
        )  # (batch_size, num_heads, seq_len, projection_dim)
        value = self.separate_heads(
            value, batch_size
        )  # (batch_size, num_heads, seq_len, projection_dim)
        attention, weights = self.attention(query, key, value)
        attention = tf.transpose(
            attention, perm=[0, 2, 1, 3]
        )  # (batch_size, seq_len, num_heads, projection_dim)
        concat_attention = tf.reshape(
            attention, (batch_size, -1, self.embed_dim)
        )  # (batch_size, seq_len, embed_dim)
        output = self.combine_heads(
            concat_attention
        )  # (batch_size, seq_len, embed_dim)
        return output

class TransformerBlock(tf.keras.layers.Layer):
    def __init__(self, embed_dim, num_heads, ff_dim, rate=0.1):
        super(TransformerBlock, self).__init__()
        self.att = MultiHeadSelfAttention(embed_dim, num_heads)
        self.ffn = tf.keras.Sequential(
            [tf.keras.layers.Dense(ff_dim, activation="relu"), tf.keras.layers.Dense(embed_dim)]
        )
        self.layernorm1 = tf.keras.layers.LayerNormalization(epsilon=1e-6)
        self.layernorm2 = tf.keras.layers.LayerNormalization(epsilon=1e-6)
        self.dropout1 = tf.keras.layers.Dropout(rate)
        self.dropout2 = tf.keras.layers.Dropout(rate)

    def call(self, inputs, training):
        shape = tf.shape(inputs)
        int_shape = inputs.shape.as_list()
        attn_output = self.att(inputs)
        attn_output = self.dropout1(attn_output, training=training)
        out1 = self.layernorm1(inputs + attn_output)
        ffn_output = self.ffn(out1)
        ffn_output = self.dropout2(ffn_output, training=training)
        values = self.layernorm2(out1 + ffn_output)
        # attempted to set shape according to this issue
        # https://github.com/tensorflow/tensorflow/issues/38296#issuecomment-623698709
        values.set_shape(self.compute_output_shape(int_shape))
        return values

def create_model(reg_param=1e-13, embed_dim=100, embedding_matrix=None, gru_units=100,
                 max_sents=12, max_sent_length=50, max_num_words=10000, num_heads=8,
                 ff_dim=1024, rate=0.1):
    l2_reg = tf.keras.regularizers.l2(reg_param)
    # Documents are batches of max_sents sentences, each max_sent_length tokens long
    doc_input = tf.keras.layers.Input(shape=(max_sents, max_sent_length,),
                                      name='doc_input',
                                      dtype='int32')
    embedding_layer = get_embedding_layer(embedding_dim=embed_dim,
                                          embedding_matrix=embedding_matrix,
                                          max_num_words=max_num_words,
                                          max_sent_length=max_sent_length,
                                          reg_param=reg_param)
    # Embed each sentence (time step) independently
    embedded_tps = tf.keras.layers.TimeDistributed(embedding_layer,
                                                   name='time_distributed_embedding')(doc_input)
    transformer_block = TransformerBlock(embed_dim,
                                         num_heads,
                                         ff_dim,
                                         rate=rate)
    # Apply the transformer encoder block to each sentence via TimeDistributed
    transformer_tps = tf.keras.layers.TimeDistributed(transformer_block,
                                                      name='time_distributed_transformer')(embedded_tps)
    max_pooled_tps = tf.keras.layers.TimeDistributed(tf.keras.layers.GlobalMaxPooling1D(),
                                                     name='time_distributed_max_pooling')(transformer_tps)
    # Sentence-level vectors are summarized over the document by a bidirectional GRU
    gru_out = tf.keras.layers.Bidirectional(tf.keras.layers.GRU(gru_units,
                                                                return_sequences=False,
                                                                kernel_regularizer=l2_reg,
                                                                name='doc_bigru'))(max_pooled_tps)
    preds = tf.keras.layers.Dense(1, activation='sigmoid',
                                  kernel_regularizer=l2_reg,
                                  name='output_pred')(gru_out)
    model = tf.keras.Model(doc_input, preds)
    return model

model = create_model(embed_dim=128,
                     gru_units=256,
                     reg_param=1e-13,
                     max_sents=12,
                     max_sent_length=512,
                     max_num_words=50000,
                     num_heads=8,
                     ff_dim=1024,
                     rate=0.1)

Raises the following error:

NotImplementedError: in user code:

    <ipython-input-63-0a712c6c4d37>:24 call *
        values.set_shape(self.compute_output_shape(int_shape))
    /Users/kevinprybol/opt/anaconda3/lib/python3.8/site-packages/tensorflow/python/keras/engine/base_layer.py:741 compute_output_shape **
        raise NotImplementedError

    NotImplementedError:
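
From the traceback, the NotImplementedError comes from the default tf.keras.layers.Layer.compute_output_shape, so my guess is that TransformerBlock needs its own override for the set_shape call above to work. Since the block should not change the shape of its input, I assume the override would just echo the input shape back, roughly like the sketch below, but I have not been able to confirm that this is the right approach:

class TransformerBlock(tf.keras.layers.Layer):
    # __init__ and call as in the example above

    def compute_output_shape(self, input_shape):
        # Assumption: attention, feed-forward, dropout and layer norm all keep
        # the (batch_size, seq_len, embed_dim) shape, so the output shape is
        # just the input shape.
        return input_shape

Is overriding compute_output_shape like this the correct way to set the output shape for a layer used inside TimeDistributed, or is there a different mechanism I should be using?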