Create logical list with strsplit on combined words to subset data frame

Question

I am trying to subset my data frame according to a condition on a specific column. For this purpose I need to create a TRUE or FALSE value for each row of this column. But some rows in this column contain several combined words, and my code cannot detect them.

# For each row, split the hashtags string on commas and test whether any
# resulting piece is exactly equal to "evet".
p <- sapply(strsplit(test$hashtags, split=","), function(x)any(x%in%"evet"))

When you check the sample data you can easily see that rows 5, 7 and 8 contain the specific word, but they are shown as FALSE.

I have tried adding the "unlist" command to my code, but it hasn't worked for me.

# unlist() flattens the per-row pieces into a single character vector, so
# sapply() yields one value per piece rather than one value per row.
p <- sapply(unlist(strsplit(test$hashtags, split=",")), function(x)any(x%in%"evet"))

I need to create a single FALSE or TRUE value per row according to a specific word, even when a combined row contains more than one word. Thanks in advance.

Sample Data:

test <- structure(list(created_at = structure(c(1489636860, 1489636860, 
1489636860, 1489636860, 1489636860, 1489636860, 1489636860, 1489636860, 
1489636860, 1489636860), class = c("POSIXct", "POSIXt"), tzone = "GMT"), 
    user.screen_name = c("bilge_bilir", "memetozturk93", "Byomeraslan", 
    "tmremolar", "orhanyilmaz_77", "tamdere", "EriVatan", "BaySancaktar", 
    "zeynepmekik", "EriVatan"), entities.hashtags = list(structure(list(
        indices = list(c(84L, 90L)), text = "Hayır"), .Names = c("indices", 
    "text"), class = "data.frame", row.names = 1L), structure(list(
        indices = list(c(65L, 70L)), text = "evet"), .Names = c("indices", 
    "text"), class = "data.frame", row.names = 1L), structure(list(
        indices = list(c(98L, 103L)), text = "Evet"), .Names = c("indices", 
    "text"), class = "data.frame", row.names = 1L), structure(list(
        indices = list(c(98L, 104L)), text = "Hayır"), .Names = c("indices", 
    "text"), class = "data.frame", row.names = 1L), structure(list(
        indices = list(c(28L, 33L), c(45L, 50L), c(89L, 94L)), 
        text = c("EVET", "EVET", "EVET")), .Names = c("indices", 
    "text"), class = "data.frame", row.names = c(NA, 3L)), structure(list(
        indices = list(c(38L, 43L)), text = "EVET"), .Names = c("indices", 
    "text"), class = "data.frame", row.names = 1L), structure(list(
        indices = list(c(20L, 29L), c(36L, 46L), c(89L, 94L)), 
        text = c("Dirilişe", "Yükselişe", "Evet")), .Names = c("indices", 
    "text"), class = "data.frame", row.names = c(NA, 3L)), structure(list(
        indices = list(c(10L, 15L), c(16L, 20L), c(21L, 26L), 
            c(27L, 31L)), text = c("Evet", "Eri", "Beli", "Yes"
        )), .Names = c("indices", "text"), class = "data.frame", row.names = c(NA, 
    4L)), structure(list(indices = list(c(125L, 130L)), text = "Evet"), .Names = c("indices", 
    "text"), class = "data.frame", row.names = 1L), structure(list(
        indices = list(c(102L, 107L)), text = "EVET"), .Names = c("indices", 
    "text"), class = "data.frame", row.names = 1L)), retweeted_status.created_at = c("Thu Mar 16 03:49:15 +0000 2017", 
    "Wed Mar 15 23:57:44 +0000 2017", "Wed Mar 15 21:07:54 +0000 2017", 
    "Wed Mar 15 20:54:43 +0000 2017", "Wed Mar 15 14:41:15 +0000 2017", 
    "Wed Mar 15 23:07:43 +0000 2017", "Wed Mar 15 15:41:06 +0000 2017", 
    NA, "Wed Mar 15 11:13:15 +0000 2017", "Wed Mar 15 16:37:13 +0000 2017"
    ), entities.user_mentions = list(structure(list(indices = list(
        c(3L, 16L), c(18L, 30L), c(44L, 55L), c(56L, 71L), c(72L, 
        83L)), screen_name = c("seremgiz8289", "bilge_bilir", 
    "OduncuTimi", "yalcinvelioglu", "OPTlMlst_Z"), id = c(301944248, 
    2189106581, 2756465282, 2668851081, 2734161237), id_str = c("301944248", 
    "2189106581", "2756465282", "2668851081", "2734161237"), 
        name = c("ATA KIZI HAYIR DİYOR", "Bilge Eryuz", "OduncuTimi ®", 
        "Yalçın Velioğlu", "OPTlMlst_Z")), .Names = c("indices", 
    "screen_name", "id", "id_str", "name"), class = "data.frame", row.names = c(NA, 
    5L)), structure(list(indices = list(c(3L, 16L)), screen_name = "kendimce_ben", 
        id = 2322523731, id_str = "2322523731", name = "İzzet#EVET/\U0001f1f9\U0001f1f7"), .Names = c("indices", 
    "screen_name", "id", "id_str", "name"), class = "data.frame", row.names = 1L), 
        structure(list(indices = list(c(3L, 12L)), screen_name = "omrolcay", 
            id = 360420809L, id_str = "360420809", name = "Ömer Olcay"), .Names = c("indices", 
        "screen_name", "id", "id_str", "name"), class = "data.frame", row.names = 1L), 
        structure(list(indices = list(c(3L, 18L)), screen_name = "mehmet_asassoy", 
            id = 3151503430, id_str = "3151503430", name = "Mehmet Asassoy"), .Names = c("indices", 
        "screen_name", "id", "id_str", "name"), class = "data.frame", row.names = 1L), 
        structure(list(indices = list(c(3L, 17L), c(120L, 132L
        )), screen_name = c("sevincbeykent", "yigitbulutt"), 
            id = c(538364458L, 256065299L), id_str = c("538364458", 
            "256065299"), name = c("Sevinç", "YİĞİT BULUT"
            )), .Names = c("indices", "screen_name", "id", "id_str", 
        "name"), class = "data.frame", row.names = 1:2), structure(list(
            indices = list(c(3L, 13L)), screen_name = "AKsamet54", 
            id = 313205928L, id_str = "313205928", name = "Samet ÇELİK"), .Names = c("indices", 
        "screen_name", "id", "id_str", "name"), class = "data.frame", row.names = 1L), 
        structure(list(indices = list(c(3L, 18L)), screen_name = "HayataTebessum", 
            id = 2911157237, id_str = "2911157237", name = "Meryem"), .Names = c("indices", 
        "screen_name", "id", "id_str", "name"), class = "data.frame", row.names = 1L), 
        structure(list(indices = list(c(0L, 9L)), screen_name = "4qet1dil", 
            id = 536676261L, id_str = "536676261", name = "KerenGo"), .Names = c("indices", 
        "screen_name", "id", "id_str", "name"), class = "data.frame", row.names = 1L), 
        structure(list(indices = list(c(3L, 18L)), screen_name = "akkadinantalya", 
            id = 1898504755L, id_str = "1898504755", name = "AK Kadın Antalya"), .Names = c("indices", 
        "screen_name", "id", "id_str", "name"), class = "data.frame", row.names = 1L), 
        structure(list(indices = list(c(3L, 15L)), screen_name = "menes__2010", 
            id = 186968367L, id_str = "186968367", name = "#EVET☪ ياسين ☝"), .Names = c("indices", 
        "screen_name", "id", "id_str", "name"), class = "data.frame", row.names = 1L)), 
    hashtags = c("hayir", "evet", "evet", "hayir", "c(\"evet\", \"evet\", \"evet\")", 
    "evet", "c(\"dirilise\", \"yukselise\", \"evet\")", "c(\"evet\", \"eri\", \"beli\", \"yes\")", 
    "evet", "evet"), mentions = list(c("seremgiz8289", "bilge_bilir", 
    "OduncuTimi", "yalcinvelioglu", "OPTlMlst_Z"), "kendimce_ben", 
        "omrolcay", "mehmet_asassoy", c("sevincbeykent", "yigitbulutt"
        ), "AKsamet54", "HayataTebessum", "4qet1dil", "akkadinantalya", 
        "menes__2010")), .Names = c("created_at", "user.screen_name", 
"entities.hashtags", "retweeted_status.created_at", "entities.user_mentions", 
"hashtags", "mentions"), row.names = c(NA, 10L), class = "data.frame")

Ronak Shah · Accepted Answer · 2018-12-31T07:01:55.523

1

That is mostly because of the way the hashtags column was generated. It was stored as a list of character vectors, and when coerced to character it gave this structure.

See for example,

# A list holding a single character vector with three elements.
list(c("A", "B", "C"))
#[[1]]
#[1] "A" "B" "C" 

# Coercing that list to character deparses the vector into one string.
as.character(list(c("A", "B", "C"))) 
#[1] "c(\"A\", \"B\", \"C\")"

Checking an individual element on your dataframe gives the same structure.

# Inspect the fifth hashtags entry: a single string holding deparsed c(...) text.
test$hashtags[5]
#[1] "c(\"evet\", \"evet\", \"evet\")"

So if there is no way you can go back and change the way the hashtags column was generated, you can use grepl instead, which also saves you the strsplit and sapply calls.

# Substring search over the whole hashtags column: one TRUE/FALSE per row.
grepl(pattern = "evet", x = test$hashtags)
#[1] FALSE  TRUE  TRUE FALSE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE

edited Dec 31 '18 at 07:01

answered Dec 31 '18 at 06:24

Ronak Shah

377,200
20
156
213

Thanks Ronak Shah. It works for only one word, but I have a list object with several words, so how can I adapt it for a list object? My function looks like this: find_sides <- function(side_hashtag, side_list){ sapply(strsplit(side_hashtag, split=","), function(x)any(x%in%side_list)) } ..... I have changed it into this, but it didn't work: find_sides <- function(side_hashtag, side_list){ sapply(strsplit(side_hashtag, split=","), function(x)any(grepl(sidelist, x))) } . Thanks – eabanoz Dec 31 '18 at 07:15
@eabanoz but that updated function does work for your sample data , right? – Ronak Shah Dec 31 '18 at 07:25
No, it didn't work. It gives "Warning messages: 1: In grepl(sidelist, x) : argument 'pattern' has length > 1 and only the first element will be used". As you can see, it only uses the first word, so it creates the same data frame for each word. – eabanoz Dec 31 '18 at 07:52
@eabanoz If you have `sidelist` as character vector, can you try `sapply(strsplit(side_hashtag, split=","), function(x)any(grepl(paste0(sidelist, collapse = "|"), x))) ` . In the given example it has only one pattern so, `sapply(strsplit(test$hashtags, split=","), function(x) any(grepl("evet", x)))` seems to work. – Ronak Shah Dec 31 '18 at 07:58

score 1 · Answer 2 · answered Dec 31 '18 at 06:25

I would use grepl here:

# Flag every row of `test` whose hashtags string contains "evet".
# strsplit() returns one character vector of pieces per row; any() collapses
# the per-piece grepl() results so that p holds exactly one TRUE/FALSE per
# row and can be used directly as a row index. vapply() enforces that shape
# instead of letting sapply() fall back to a ragged list.
p <- vapply(strsplit(test$hashtags, split = ","), function(x) {
    any(grepl("evet", x))
}, logical(1))

If you really wanted to match the standalone word evet, then use word boundaries:

# Flag every row of `test` whose hashtags string contains the standalone
# word "evet".
# The backslashes are doubled because inside an R string literal "\b" is the
# backspace character; "\\b" is what hands the regex word-boundary token to
# grepl(). any() reduces the per-piece matches to one TRUE/FALSE per row, and
# vapply() guarantees a plain logical vector.
p <- vapply(strsplit(test$hashtags, split = ","), function(x) {
    any(grepl("\\bevet\\b", x))
}, logical(1))

score 0 · Answer 3 · answered Dec 31 '18 at 07:09

We can create a logical index column with str_detect

library(tidyverse)
# Add a logical column `ind` marking the rows whose hashtags string
# contains "evet", then show that column.
out <- mutate(test, ind = str_detect(string = hashtags, pattern = "evet"))
out[["ind"]]
#[1] FALSE  TRUE  TRUE FALSE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE

If we need to get the logical index for each word

# Split each hashtags string into words and test every word for "evet";
# `ind` becomes a list column holding one logical vector per row.
mutate(
  test,
  ind = map(str_extract_all(hashtags, "\\w+"), str_detect, pattern = "evet")
)

Create logical list with strsplit on combined words to subset data frame

3 Answers