0

I encountered an odd issue when trying to use tf-idf on my corpus.

Here is my code:

# Clean raw text before tokenisation.
#
# x: the text to clean; it is piped straight into stringr functions, so it is
#    expected to be a character vector (the call sites pass document text).
# Returns x after lower-casing and the replacements below. Every removed span
# is replaced by a space and runs of whitespace are collapsed at the end.
#
# The steps are order-dependent: URLs are removed before "/" and trailing
# punctuation are replaced, so the URL pattern still sees the whole address.
prep_fun <- function(x) {
  x %>%
    # make text lower case
    str_to_lower() %>%
    # remove tags
    str_replace_all("<.*?>", " ") %>%
    # remove everything that is neither alphanumeric nor punctuation
    # (this also turns newlines into spaces)
    str_replace_all("[^a-zA-Z0-9[:punct:]]", " ") %>%
    # remove URLs: match only up to the next whitespace so that text following
    # a URL is kept (a greedy ".*" would run on to the last ".xyz" in the
    # document, because no newlines are left at this point)
    str_replace_all("(f|ht)tps?://\\S+", " ") %>%
    # remove parentheses and section signs
    str_replace_all("\\(", " ") %>%
    str_replace_all("\\)", " ") %>%
    str_replace_all("§", " ") %>%
    # remove free-standing dots and punctuation that is followed by a space
    str_replace_all(" \\. ", " ") %>%
    str_replace_all("[\\.;:,+-] ", " ") %>%
    str_replace_all("/", " ") %>%
    # remove standalone numbers: digit runs that sit on word boundaries and
    # are not directly preceded or followed by a hyphen
    str_replace_all("\\s*(?<!\\B|-)\\d+(?!\\B|-)\\s*", " ") %>%
    # collapse multiple spaces
    str_replace_all("\\s+", " ")
}
# Tokenise x with word_tokenizer(), then stem every element of the result with
# the English Snowball stemmer. Returns the list produced by lapply(), i.e. one
# stemmed element per element of the tokenizer output.
stem_tokenizer <- function(x) {
  tokens <- word_tokenizer(x)
  lapply(tokens, SnowballC::wordStem, language = "en")
}

# Token iterator over the training text: documents are cleaned with prep_fun
# and tokenised/stemmed with stem_tokenizer; ids come from train$id.
it <- itoken(
  as.character(train$text),
  preprocessor = prep_fun,
  progressbar = TRUE,
  tokenizer = stem_tokenizer,
  ids = train$id
)

# Vocabulary built with the English and SMART stop-word lists excluded, then
# pruned with term_count_min = 3.
v <- create_vocabulary(
  it,
  stopwords = c(stopwords("english"), stopwords("SMART"))
) %>%
  prune_vocabulary(term_count_min = 3)
vectorizer <- vocab_vectorizer(v)

# Document-term matrix for the same iterator.
dtm <- create_dtm(it, vectorizer)

tfidf <- TfIdf$new()

# This is the call that raises the reported error.
dtm_train_tfidf <- fit_transform(dtm, tfidf)

When I run it, it fails at the fit_transform part with the following message:

'names' attribute [90214] must be the same length as the vector [10]

Has anyone encountered an issue like this before?

Thanks!

Update: I did the same thing with the movie review dataset:

# Same pipeline on the movie_review data; the preprocessor and tokenizer are
# passed by name so the call reads the same way as any other itoken() call
# that uses these arguments.
it <- itoken(
  movie_review$review,
  preprocessor = prep_fun,
  tokenizer = stem_tokenizer,
  ids = movie_review$id
)
# Vocabulary built with the English and SMART stop-word lists excluded, then
# pruned with term_count_min = 3.
v <- create_vocabulary(
  it,
  stopwords = c(stopwords("english"), stopwords("SMART"))
) %>%
  prune_vocabulary(term_count_min = 3)

vectorizer <- vocab_vectorizer(v)

dtm <- create_dtm(it, vectorizer)

tfidf <- TfIdf$new()

# This is the call that raises the reported error.
dtm_train_tfidf <- fit_transform(dtm, tfidf)
And I still get the same error:

Error in .local(x, na.rm, dims, ...) : 'names' attribute [5000] must be the same length as the vector [10]

Zakkery
  • 420
  • 4
  • 11

0 Answers0