I need help with the transform function. What I want to do is scramble the tokens/words of a sentence and feed the scrambled version only to the encoder, while the decoder receives the same tokens/words in their original, unscrambled order. I can't figure out how to do this inside the for loop. Please help.
'''
This is the function to scramble words in a sentence.
def scramble(text):
words = text.split()
random.shuffle(words)
return ' '.join(words)
This function tokenizes the text/sentences.
def tokenize(text):
tokens = [re.sub(REMOVE_CHARS, '', token)
for token in re.split("[-\n ]", text)]
return tokens
This function introduces random spelling errors into a word of each sentence.
def add_speling_erors(tokn, error_rate):
"""Simulate some artificial spelling mistakes."""
assert(0.0 <= error_rate < 1.0)
if len(tokn) < 3:
return tokn
rand = np.random.rand()
# Here are 4 different ways spelling mistakes can occur,
# each of which has equal chance.
prob = error_rate / 4.0
if rand < prob:
# Replace a character with a random character.
random_char_index = np.random.randint(len(tokn))
tokn = tokn[:random_char_index] + np.random.choice(CHARS) \
+ tokn[random_char_index + 1:]
elif prob < rand < prob * 2:
# Delete a character.
random_char_index = np.random.randint(len(tokn))
tokn = tokn[:random_char_index] + tokn[random_char_index + 1:]
elif prob * 2 < rand < prob * 3:
# Add a random character.
random_char_index = np.random.randint(len(tokn))
tokn = tokn[:random_char_index] + np.random.choice(CHARS) \
+ tokn[random_char_index:]
elif prob * 3 < rand < prob * 4:
# Transpose 2 characters.
random_char_index = np.random.randint(len(tokn) - 1)
tokn = tokn[:random_char_index] + tokn[random_char_index + 1] \
+ tokn[random_char_index] + tokn[random_char_index + 2:]
else:
# No spelling errors.
pass
return tokn
This function transforms the input tokens into encoder, decoder and target tokens.
def transform(tokens, maxlen, error_rate=0.3, shuffle=True):
if shuffle:
print('Shuffling data.')
np.random.shuffle(tokens)
encoder_tokens = []
decoder_tokens = []
target_tokens = []
for token in tokens:
text = TreebankWordDetokenizer().detokenize(tokens)
text = scramble (text)
tokens = tokenize (text)
encoder = add_speling_erors(token, error_rate=error_rate)
encoder += EOS * (maxlen - len(encoder)) # Padded to maxlen.
encoder_tokens.append(encoder)
decoder = SOS + token
decoder += EOS * (maxlen - len(decoder))
decoder_tokens.append(decoder)
target = decoder[1:]
target += EOS * (maxlen - len(target))
target_tokens.append(target)
assert(len(encoder) == len(decoder) == len(target))
return encoder_tokens, decoder_tokens, target_tokens
'''