I created my own custom transformer for a text-processing pipeline. Inside its `.transform()`
method, I want to remove the corresponding target (label) row whenever a document yields no tokens after preprocessing.
class SpacyVectorizer(BaseEstimator, TransformerMixin):
    """Turn raw documents into spaCy document vectors after token clean-up.

    Each document is tokenised with spaCy, optionally filtered (alphabetic
    tokens only, stop words removed) and normalised (lemmatised, case-folded).
    The surviving tokens are re-joined and passed through spaCy again, and the
    resulting ``Doc.vector`` becomes the document's feature vector.

    Args:
        alpha_only: Keep only tokens for which ``token.is_alpha`` is true.
        lemmatize: Replace every token by its lemma.
        remove_stopwords: Drop tokens for which ``token.is_stop`` is true.
        case_fold: Case-fold every kept token.
    """

    def __init__(
        self,
        alpha_only: bool = True,
        lemmatize: bool = True,
        remove_stopwords: bool = True,
        case_fold: bool = True,
    ):
        self.alpha_only = alpha_only
        self.lemmatize = lemmatize
        self.remove_stopwords = remove_stopwords
        self.case_fold = case_fold
        # The parser and NER components are disabled; only token-level
        # attributes (is_alpha, is_stop, lemma_) and vectors are used here.
        self.nlp = spacy.load(
            name='en_core_web_sm',
            disable=["parser", "ner"]
        )

    def fit(self, X, y=None):
        """Do nothing and return ``self``; there is nothing to learn."""
        return self

    def _clean_tokens(self, doc):
        """Return the filtered, normalised token strings of one spaCy doc."""
        words = []
        for token in doc:
            # Remove non-alphabetic tokens
            if self.alpha_only and not token.is_alpha:
                continue
            # Stopword removal
            if self.remove_stopwords and token.is_stop:
                continue
            # Always work with a string so the later join cannot fail when
            # both lemmatisation and case folding are switched off.
            text = token.lemma_ if self.lemmatize else token.text
            if self.case_fold:
                text = text.casefold()
            words.append(text)
        return words

    def transform(self, X, y=None):
        """Vectorise the documents in ``X``.

        Args:
            X: Iterable of raw text documents.
            y: Optional target labels. ``y`` is optional so the transformer
                can be called as ``transform(X)``, e.g. from a scikit-learn
                ``Pipeline``. When given, it must support ``.index`` and
                ``.drop(..., inplace=True)`` (presumably a pandas
                Series/DataFrame -- confirm against callers).

        Returns:
            A list of document vectors.

            * ``y`` given: documents with no remaining tokens are left out
              and their labels are dropped from ``y`` in place, so the
              result and ``y`` stay aligned.
            * ``y`` omitted: every document keeps its row; a document with
              no remaining tokens is represented by an all-zero vector, so
              the result stays aligned with labels held by the caller.
        """
        import numpy as np

        token_lists = [self._clean_tokens(doc) for doc in self.nlp.pipe(X)]
        kept = [i for i, words in enumerate(token_lists) if words]
        empty = [i for i, words in enumerate(token_lists) if not words]

        # Vectorise all non-empty documents in one batched pass.
        cleaned_texts = (' '.join(token_lists[i]) for i in kept)
        vectors = [doc.vector for doc in self.nlp.pipe(cleaned_texts)]

        if y is not None:
            # Drop every empty-document label in a single call. Dropping one
            # at a time inside the loop would shift the remaining positions
            # and remove the wrong labels.
            if empty:
                y.drop(y.index[empty], inplace=True)
            return vectors

        if not empty:
            return vectors

        # No labels to adjust: keep the row count and use a zero placeholder.
        # The width is taken from a real vector when one exists, because the
        # vocabulary's vector length may differ from Doc.vector's width for
        # models without static vectors.
        if vectors:
            width = vectors[0].shape[0]
        else:
            width = self.nlp.vocab.vectors_length
        by_position = dict(zip(kept, vectors))
        return [
            by_position[i] if i in by_position
            else np.zeros(width, dtype="float32")
            for i in range(len(token_lists))
        ]
Unfortunately, this does not work, because I cannot pass the `y` vector to the `.transform()` method.
How can I force the pipeline to pass both the `X` and `y` parameters?
Is there any other workaround to achieve this?
I don't want to pass `y` via `.fit_transform()`, because test data shouldn't be fitted.