0

Based on this article: http://nadbordrozd.github.io/blog/2016/05/20/text-classification-with-word2vec/ I am trying to implement a gensim word2vec model with the pretrained vectors of GloVe in a text classification task. However, I would like to do FeatureSelection also in my text data. I tried multiple sequences in the pipeline but i get fast a memory error which points to the transform part of TfidfEmbeddingVectorizer.

   return np.array([
                np.mean([self.word2vec[w] * self.word2weight[w]
                         for w in words if w in self.word2vec] or
                        [np.zeros(self.dim)], axis=0)
                for words in X

If I replace the TfidfEmbeddingVectorizer class with a regular TfIdfVectorizer it works properly. Is there a way I could combine SelectFromModel and W2vec in the pipeline?

from sklearn.base import BaseEstimator, TransformerMixin
import pandas as pd
import numpy as np
import itertools
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import precision_recall_fscore_support as score, f1_score
import pickle
from sklearn.externals import joblib
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import SelectFromModel
from sklearn.pipeline import FeatureUnion
from sklearn.feature_extraction import DictVectorizer
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.svm import LinearSVC
import gensim
import collections

class ItemSelector(BaseEstimator, TransformerMixin):

    def __init__(self, column):
        self.column = column

    def fit(self, X, y=None, **fit_params):
        return self

    def transform(self, X):
        return (X[self.column])




class TextStats(BaseEstimator, TransformerMixin):
    """Extract features from each document for DictVectorizer"""

    def fit(self, x, y=None):
        return self

    def transform(self, posts):
        return [{'REPORT_M': text}
                for text in posts]


class TfidfEmbeddingVectorizer(object):
  def __init__(self, word2vec):
    self.word2vec = word2vec
    self.word2weight = None
    self.dim = len(word2vec.values())

  def fit(self, X, y):
    tfidf = TfidfVectorizer(analyzer=lambda x: x)
    tfidf.fit(X)
    # if a word was never seen - it must be at least as infrequent
    # as any of the known words - so the default idf is the max of 
    # known idf's
    max_idf = max(tfidf.idf_)
    self.word2weight = collections.defaultdict(
        lambda: max_idf,
        [(w, tfidf.idf_[i]) for w, i in tfidf.vocabulary_.items()])

    return self

  def transform(self, X):
    return np.array([
            np.mean([self.word2vec[w] * self.word2weight[w]
                     for w in words if w in self.word2vec] or
                    [np.zeros(self.dim)], axis=0)
            for words in X
        ])


# training model
 def train(data_train, data_val):

    with open("glove.6B/glove.6B.50d.txt", "rb") as lines:
        w2v = {line.split()[0]: np.array(map(float, line.split()[1:]))
               for line in lines}
    classifier = Pipeline([
                    ('union', FeatureUnion([

                            ('text', Pipeline([
                                ('selector', ItemSelector(column='TEXT')),
                                ("word2vec vectorizer", TfidfEmbeddingVectorizer(w2v)),
                                ('feature_selection', SelectFromModel(LinearSVC(penalty="l1", dual=False),threshold=0.01))
                            ])),

                            ('category', Pipeline([
                                ('selector', ItemSelector(column='category')),
                                ('stats', TextStats()),
                                ('vect', DictVectorizer())
                            ])) 
    ])),
                    ('clf',ExtraTreesClassifier(n_estimators=200, max_depth=500, min_samples_split=6, class_weight= 'balanced'))])

    classifier.fit(data_train,data_train.CLASSES)
    predicted = classifier.predict(data_val)
Vas
  • 343
  • 4
  • 18

1 Answers1

0

I think in here self.dim = len(word2vec.values()) you should specify the dimension of the model. If you are using glove.6B.50d.txt, then the dimension should be 50.

len(word2vec.values()) is the total number of words, thus will create a huge matrix, i.e., memory error.

S Liu
  • 1