0

I need help with the transform function. What I want to do is scramble the tokens/words of a sentence and feed the scrambled version only to the encoder, while the decoder receives the same tokens/words, unscrambled, as its input. I think the part I can't figure out is the for loop. Please help.

'''

This is the function to scramble words in a sentence.

def scramble(text):
    """Return *text* with its whitespace-separated words in random order.

    The words are re-joined with single spaces, so any run of whitespace in
    the input is collapsed and leading/trailing whitespace is dropped.
    """
    pieces = text.split()
    random.shuffle(pieces)  # in-place shuffle of the word list
    shuffled_text = ' '.join(pieces)
    return shuffled_text

This function tokenizes the text/sentences.

def tokenize(text):
    """Split *text* into tokens and strip unwanted characters from each one.

    The text is split on hyphens, newlines and spaces; every resulting piece
    then has all matches of the ``REMOVE_CHARS`` pattern removed. Pieces that
    end up empty are kept in the returned list.
    """
    cleaned = []
    for piece in re.split("[-\n ]", text):
        cleaned.append(re.sub(REMOVE_CHARS, '', piece))
    return cleaned

This function introduces random spelling errors into a word of each sentence.

def add_speling_erors(tokn, error_rate):
    """Return *tokn* with at most one simulated spelling mistake.

    Tokens shorter than three characters are returned unchanged. Otherwise a
    single random draw selects one of four edits -- replace, delete, insert
    or transpose a character -- each covering a band of width
    ``error_rate / 4``; a draw outside those bands (or exactly on a band
    boundary) leaves the token untouched.

    ``error_rate`` must satisfy ``0.0 <= error_rate < 1.0``; this is checked
    with an ``assert``.
    """
    assert(0.0 <= error_rate < 1.0)
    if len(tokn) < 3:
        return tokn

    draw = np.random.rand()
    # The four kinds of mistake share error_rate equally.
    band = error_rate / 4.0

    if draw < band:
        # Substitute one character with a random one taken from CHARS.
        pos = np.random.randint(len(tokn))
        return tokn[:pos] + np.random.choice(CHARS) + tokn[pos + 1:]

    if band < draw < band * 2:
        # Drop one character.
        pos = np.random.randint(len(tokn))
        return tokn[:pos] + tokn[pos + 1:]

    if band * 2 < draw < band * 3:
        # Insert a random character from CHARS in front of position pos.
        pos = np.random.randint(len(tokn))
        return tokn[:pos] + np.random.choice(CHARS) + tokn[pos:]

    if band * 3 < draw < band * 4:
        # Swap two neighbouring characters; pos stops one short of the end
        # so that pos + 1 is always a valid index.
        pos = np.random.randint(len(tokn) - 1)
        return tokn[:pos] + tokn[pos + 1] + tokn[pos] + tokn[pos + 2:]

    # No spelling error for this draw.
    return tokn

This function transforms the input tokens into encoder, decoder and target tokens.

    def transform(tokens, maxlen, error_rate=0.3, shuffle=True):
        """Build padded encoder, decoder and target strings for every entry.

        For each entry of *tokens*:

        * the encoder string is the entry with its words scrambled and then
          passed through ``add_speling_erors``;
        * the decoder string is ``SOS`` followed by the original,
          unscrambled entry;
        * the target string is the decoder string without its first
          character.

        All three are right-padded with ``EOS`` up to *maxlen*.

        Args:
            tokens: Mutable sequence of strings. When *shuffle* is true it is
                reordered in place by ``np.random.shuffle``.
            maxlen: Length every output string is padded to.
            error_rate: Forwarded to ``add_speling_erors``.
            shuffle: Whether to shuffle *tokens* before processing.

        Returns:
            A tuple ``(encoder_tokens, decoder_tokens, target_tokens)`` of
            three equally long lists of strings.

        Raises:
            AssertionError: If the three strings built for an entry do not
                end up the same length (for instance when an entry does not
                fit within *maxlen*, since padding cannot shorten it).
        """
        if shuffle:
            print('Shuffling data.')
            np.random.shuffle(tokens)

        encoder_tokens = []
        decoder_tokens = []
        target_tokens = []
        for token in tokens:
            # Scramble the word order for the encoder input only. The
            # previous version detokenized/scrambled/re-tokenized the whole
            # list on every iteration, rebinding `tokens` without ever
            # feeding the scrambled text to the encoder.
            # NOTE(review): assumes each entry is a sentence of
            # space-separated words; for a single-word entry the scramble
            # leaves the word itself unchanged -- confirm against the caller.
            encoder = add_speling_erors(scramble(token), error_rate=error_rate)
            encoder += EOS * (maxlen - len(encoder))  # Padded to maxlen.
            encoder_tokens.append(encoder)

            # The decoder sees the original, unscrambled entry.
            decoder = SOS + token
            decoder += EOS * (maxlen - len(decoder))
            decoder_tokens.append(decoder)

            # Target is the decoder string shifted left by one character.
            # NOTE(review): presumably SOS is a single character -- verify.
            target = decoder[1:]
            target += EOS * (maxlen - len(target))
            target_tokens.append(target)

            assert(len(encoder) == len(decoder) == len(target))
        return encoder_tokens, decoder_tokens, target_tokens

'''
OverDose
  • 13
  • 2

0 Answers0