4

I want to see whether the text column has elements outside the specified values of "a" and "b"

# Values that are allowed to appear in the comma-separated `text` column.
specified_value <- c("a", "b")

# Example input: one comma-separated string per key.
df <- data.frame(
  key = c(1, 2, 3, 4),
  text = c("a,b,c", "a,d", "1,2", "a,b"),
  stringsAsFactors = FALSE
)
# Desired output: for each key, the elements of `text` that are outside
# specified_value (NA when there are none).
df_out <- data.frame(
  key = c(1, 2, 3, 4),
  text = c("c", "d", "1,2", NA),
  stringsAsFactors = FALSE
)

This is what I have tried:

# Attempt from the question: split `text` on "," into `text_vector`, then try
# to keep in `extra` the pieces that are not in specified_value.
# NOTE(review): strsplit() returns a list with one character vector per row, so
# `%in%` below is applied to whole list elements rather than to the individual
# split strings -- presumably why the result is not the desired one; confirm.
df=df%>%mutate(text_vector=strsplit(text, split=","), 
extra=text_vector[which(!text_vector %in% specified_value)])

But this doesn't work, any suggestions?

Ashti
  • 193
  • 1
  • 10

4 Answers

2

We can split 'text' on the `,` delimiter with `separate_rows`, group by 'key', keep the elements that are not in 'specified_value' with `setdiff`, and paste them together with `toString`. A join then brings back the other columns of the original dataset.

library(dplyr) # >= 1.0.0
library(tidyr)
# For each key, collect the elements of `text` that are not in
# specified_value, then attach the original columns again.
df %>%
    # One row per separated element of `text`.
    separate_rows(text) %>%
    group_by(key) %>%
    # setdiff() keeps the elements outside specified_value; toString() joins
    # them with ", ".
    summarise(extra = toString(setdiff(text, specified_value))) %>%
    # Name the join column explicitly instead of relying on a natural join
    # over whatever columns the two tables happen to share.
    left_join(df, by = "key") %>%
    # Keys with nothing left over have an empty string; turn that into NA.
    mutate(extra = na_if(extra, ""))
# A tibble: 4 x 3
#    key extra text 
#  <dbl> <chr> <chr>
#1     1 c     a,b,c
#2     2 d     a,d  
#3     3 1, 2  1,2  
#4     4 <NA>  a,b  
akrun
  • 874,273
  • 37
  • 540
  • 662
1

Using setdiff.

# For every row, the elements of the comma-separated `text` that are not in
# specified_value, re-joined with ","; rows with nothing left get a real NA.
df$outside <- vapply(
  strsplit(df$text, ","),
  function(parts) {
    extra <- setdiff(parts, specified_value)
    # Return NA_character_ directly: pasting an NA would produce the string
    # "NA", which is.na() does not recognise as missing.
    if (length(extra) == 0L) NA_character_ else paste(extra, collapse = ",")
  },
  character(1)
)
df
#   key  text outside
# 1   1 a,b,c       c
# 2   2   a,d       d
# 3   3   1,2     1,2
# 4   4   a,b    <NA>

Data:

# Reproducible example: the allowed values, and a four-row data frame holding
# one comma-separated string per key.
specified_value <- c("a", "b")

df <- data.frame(
  key = c(1, 2, 3, 4),
  text = c("a,b,c", "a,d", "1,2", "a,b"),
  stringsAsFactors = FALSE
)
jay.sf
  • 60,139
  • 8
  • 53
  • 110
0

use stringi::stri_split_fixed

library(stringi)
# Each expression is TRUE when the comma-separated string contains at least one
# element that is not in specified_value.
# TRUE is spelled out: T is an ordinary variable that can be reassigned.
!all(stri_split_fixed("a,b", ",", simplify = TRUE) %in% specified_value) # FALSE
!all(stri_split_fixed("a,b,c", ",", simplify = TRUE) %in% specified_value) # TRUE
Ric
  • 5,362
  • 1
  • 10
  • 23
0

An option using regex, without splitting the data on commas:

# Regex-only approach: delete every whole comma-separated token of df$text that
# equals one of specified_value, without splitting the strings.
# Each value is wrapped in \Q...\E so it is matched literally. It is anchored
# to a token start (string start or a comma) and a token end (a comma or the
# string end), so "a" is not stripped out of a longer token such as "ab".
token_pattern <- paste0(
  "(?:^|,)(?:",
  paste0("\\Q", specified_value, "\\E", collapse = "|"),
  ")(?=,|$)"
)
df$text1 <- gsub(token_pattern, "", df$text, perl = TRUE)
# A removed first token leaves its separator behind; drop that leading comma.
df$text1 <- gsub("^,", "", df$text1)
df
#  key  text text1
#1   1 a,b,c     c
#2   2   a,d     d
#3   3   1,2   1,2
#4   4   a,b      
Ronak Shah
  • 377,200
  • 20
  • 156
  • 213