I'm running into a problem when trying to tokenize text with a DistilBERT tokenizer in a Jupyter Notebook.
Here's my full code:
import re
import numpy as np

# tokenizer is my DistilBERT tokenizer, loaded earlier in the notebook

maxlen = 50

# tokens: clean, lowercase, tokenize, truncate, and wrap each text in [CLS]/[SEP]
maxqnans = int((maxlen - 20) / 2)
corpus_tokenized = ["[CLS] " +
                    " ".join(tokenizer.tokenize(re.sub(r'[^\w\s]+|\n', '',
                    str(txt).lower().strip()))[:maxqnans]) +
                    " [SEP]" for txt in corpus]

# masks: 1 for real tokens, 0 for the padding added below
masks = [[1] * len(txt.split(" ")) + [0] * (maxlen - len(txt.split(" ")))
         for txt in corpus_tokenized]

# padding: pad every sequence to exactly maxlen tokens with [PAD]
txt_seq = [txt + " [PAD]" * (maxlen - len(txt.split(" ")))
           if len(txt.split(" ")) != maxlen else txt
           for txt in corpus_tokenized]

# idx: convert each padded sequence to token ids
idx = [tokenizer.encode(seq.split(" ")) for seq in txt_seq]

# segments: one segment id per token, incremented after every [SEP]
segments = []
for seq in txt_seq:
    temp, i = [], 0
    for token in seq.split(" "):
        temp.append(i)
        if token == "[SEP]":
            i += 1
    segments.append(temp)

# vector: the final model inputs
X_train = [np.asarray(idx, dtype='int32'),
           np.asarray(masks, dtype='int32'),
           np.asarray(segments, dtype='int32')]
The traceback points to this line:
idx = [tokenizer.encode(seq.split(" ")) for seq in txt_seq]
I'm getting the following error:
TypeError: TextEncodeInput must be Union[TextInputSequence, Tuple[InputSequence, InputSequence]]
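For what it's worth, I can reproduce the same error with a minimal example (assuming a Hugging Face fast tokenizer; "distilbert-base-uncased" is just a stand-in for whatever checkpoint is actually loaded):

from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")

print(tokenizer.encode("hello world"))          # a plain string works
print(tokenizer.encode("hello world".split()))  # a list of tokens raises the same TypeError

So encode seems to accept a single string but not a list of tokens, which is exactly what my seq.split(" ") produces.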
Can anybody help me with this? Thank you!