I am trying to tokenize my sentences to generate the input for my RoBERTa model, but when I tokenize the sentences, which are in the form of a NumPy array, the tokenizer fails and results in the following error: TypeError: 'float' object is not iterable
Here is my code:
import numpy as np

def roberta_encode(texts, tokenizer):
    ct = len(texts)
    input_ids = np.ones((ct, MAX_LEN), dtype='int32')        # RoBERTa's pad token ID is 1
    attention_mask = np.zeros((ct, MAX_LEN), dtype='int32')
    token_type_ids = np.zeros((ct, MAX_LEN), dtype='int32')  # Not used in text classification

    for k, text in enumerate(texts):
        # Tokenize
        tok_text = tokenizer.tokenize(text)

        # Truncate and convert tokens to numerical IDs
        enc_text = tokenizer.convert_tokens_to_ids(tok_text[:(MAX_LEN - 2)])

        input_length = len(enc_text) + 2
        input_length = input_length if input_length < MAX_LEN else MAX_LEN

        # Add the <s> (ID 0) and </s> (ID 2) tokens at the beginning and the end
        input_ids[k, :input_length] = np.asarray([0] + enc_text + [2], dtype='int32')

        # Set the attention mask to 1 for the real (non-padding) tokens
        attention_mask[k, :input_length] = 1

    return {
        'input_word_ids': input_ids,
        'input_mask': attention_mask,
        'input_type_ids': token_type_ids
    }
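For context, here is roughly how the function is called (a minimal sketch; I'm assuming the tokenizer is loaded with Hugging Face's transformers, and the 'roberta-base' checkpoint and MAX_LEN = 128 are only placeholders, not necessarily the exact values I use):

from transformers import RobertaTokenizer

MAX_LEN = 128  # placeholder; defined globally in my notebook
tokenizer = RobertaTokenizer.from_pretrained('roberta-base')  # placeholder checkpoint

encoded = roberta_encode(sentences, tokenizer)  # sentences is the np array shown below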
And here are my sentences:
array(['it would be nice kalik ye kl situasi sy kek gin misale',
'page of coba sahabat adan moga ratu orang',
'bandung hiden gems inap lokasi kaki gunung nginep cocok banget ngerefresh otak d',
..., 'liat wendy nyanyi psycho smtownlive', 'mongo bikin atur dw',
'rebus aja jd rebu ful package gin mah jd gocap wkwkwkwkwk'],
dtype=object)
I tried converting my data from NumPy back to a pandas DataFrame, but it still didn't work.
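Roughly what that attempt looked like (a sketch; the column name 'text' is only illustrative):

import pandas as pd

df = pd.DataFrame({'text': sentences})                   # wrap the np array in a DataFrame
encoded = roberta_encode(df['text'].values, tokenizer)   # still raises the same TypeError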