If your network is an old-fashioned encoder-decoder model (without attention), then, as @Prune said, it has a memory bottleneck (the encoder dimensionality). Thus, such a network cannot learn to reverse strings of arbitrary size. However, you can train such an RNN to reverse strings of limited size. For example, the following toy seq2seq LSTM is able to reverse sequences of digits with length up to 10. Here is how you train it:
from keras.models import Model
from keras.layers import Input, LSTM, Dense, Embedding
import numpy as np
# --- Hyperparameters ---------------------------------------------------
emb_dim = 20  # Size of each token embedding vector.
latent_dim = 100  # Latent dimensionality of the encoding space.
vocab_size = 12  # digits 0-9, 10 is for start token, 11 for end token

# --- Layers (created up front, wired together below) -------------------
# One embedding table is shared by the encoder and the decoder inputs.
common_emb = Embedding(input_dim=vocab_size, output_dim=emb_dim)
# The encoder only needs to expose its final states, not per-step outputs.
encoder = LSTM(latent_dim, return_state=True)
# The decoder emits an output at every step so each target token is scored.
decoder_lstm = LSTM(latent_dim, return_sequences=True, return_state=True)
# Per-step softmax over the vocabulary.
decoder_dense = Dense(vocab_size, activation='softmax')

# --- Encoder ------------------------------------------------------------
# shape=(None,) leaves the sequence length unspecified.
encoder_inputs = Input(shape=(None,), name='enc_inp')
encoder_emb = common_emb(encoder_inputs)
encoder_outputs, state_h, state_c = encoder(encoder_emb)
# Only these two state tensors are handed to the decoder; this is the
# fixed-size bottleneck the whole input sequence must pass through.
encoder_states = [state_h, state_c]

# --- Decoder ------------------------------------------------------------
decoder_inputs = Input(shape=(None,), name='dec_inp')
decoder_emb = common_emb(decoder_inputs)
# The decoder starts from the encoder's final states; its own returned
# states are discarded here.
decoder_outputs, _, _ = decoder_lstm(decoder_emb, initial_state=encoder_states)
decoder_outputs = decoder_dense(decoder_outputs)

# --- Full training model: (encoder tokens, decoder tokens) -> predictions
model = Model([encoder_inputs, decoder_inputs], decoder_outputs)
def generate_batch(length=4, batch_size=64):
    """Build one random training batch for the sequence-reversal task.

    Each sample is a random sequence of ``length`` digits (0-9). Token 10 is
    used as the start marker and token 11 as the end marker.

    Args:
        length: Number of digits in every sequence of the batch.
        batch_size: Number of sequences in the batch.

    Returns:
        A tuple ``([enc_x, dec_x], dec_y_onehot)`` where
        ``enc_x`` is the start token followed by the digits,
        ``dec_x`` is the start token followed by the reversed digits, and
        ``dec_y_onehot`` is the reversed digits followed by the end token,
        one-hot encoded with shape ``(batch_size, length + 1, vocab_size)``.

    Note:
        Reads the module-level ``vocab_size`` for the one-hot depth.
    """
    x = np.random.randint(low=0, high=10, size=(batch_size, length))
    y = x[:, ::-1]  # the target is the input reversed along the time axis

    start = np.full((batch_size, 1), 10, dtype=int)
    end = np.full((batch_size, 1), 11, dtype=int)

    enc_x = np.concatenate([start, x], axis=1)
    # The decoder input is the target shifted right by one step: at each
    # position the decoder sees the previous target token.
    dec_x = np.concatenate([start, y], axis=1)
    dec_y = np.concatenate([y, end], axis=1)

    # Row i of the identity matrix is the one-hot vector for token i, so
    # indexing it with dec_y encodes every target token in one step.
    dec_y_onehot = np.eye(vocab_size, dtype=int)[dec_y]
    return [enc_x, dec_x], dec_y_onehot
def generate_batches(batch_size=64, max_length=10):
    """Yield an endless stream of training batches of varying length.

    Every batch uses a single sequence length, drawn uniformly from
    ``1 .. max_length`` (both ends included).

    Args:
        batch_size: Number of sequences in each yielded batch.
        max_length: Largest sequence length (in digits) that can be drawn.

    Yields:
        Whatever ``generate_batch`` returns for the drawn length.
    """
    while True:
        # np.random.randint excludes `high`; without the +1 the longest
        # sequences (length == max_length) would never be generated, and
        # max_length == 1 would raise because low == high.
        length = np.random.randint(low=1, high=max_length + 1)
        yield generate_batch(length=length, batch_size=batch_size)
# The targets are one-hot vectors, hence the categorical loss and metric.
model.compile(
    optimizer='rmsprop',
    loss='categorical_crossentropy',
    metrics=['categorical_accuracy'],
)
# The batch generator never terminates, so steps_per_epoch is what
# defines the size of an "epoch" here.
model.fit_generator(
    generate_batches(),
    steps_per_epoch=1000,
    epochs=20,
)
Now you can apply it to reverse a sequence (my decoding loop is very inefficient, because it re-runs the whole model for every output token, but it does illustrate the principle):
# Encoder input: the start token (10) followed by the digits to reverse.
input_seq = np.array([[10, 2, 1, 2, 8, 5, 0, 6]])
# Decoder input built so far; it begins with just the start token.
result = np.array([[10]])
next_digit = -1
# Greedy decoding, capped at 100 steps in case the end token never appears.
for _ in range(100):
    # Re-run the full model on everything decoded so far and keep only the
    # prediction for the last position.
    predictions = model.predict([input_seq, result])
    next_digit = predictions[0][-1].argmax()
    if next_digit == 11:
        # End token: the output sequence is complete.
        break
    result = np.concatenate([result, [[next_digit]]], axis=1)
# Drop the leading start token before printing.
print(result[0][1:])
Hooray, it prints [6 0 5 8 2 1 2]!
Generally, you can think of such a model as a weird autoencoder (with a reversal side effect), and choose an architecture and training procedure suitable for autoencoders. There is a vast literature on text autoencoders.
Moreover, if you build an encoder-decoder model with attention, it will have no memory bottleneck, so in principle it is possible to reverse a sequence of any length with a neural network. However, attention requires quadratic computational time, so in practice even neural networks with attention will be very inefficient for long sequences.