0

I have two data frames. I need to compare each string in df_1$col_1 against the strings in df_2$col_2, find the similar strings in col_2, and store the number of matching phrases in the df_out data frame.

# Input strings to look up; each element is a comma-separated list of phrases.
col_1 <- c(
  "inside the world,worldwide web,google chrome app",
  "world health organisation, from country",
  "team work",
  "size of the country, bigger compared to other"
)
df_1 <- data.frame(col_1)


# Candidate strings to compare against; same comma-separated phrase format.
col_2 <- c(
  "team work,in the company",
  "size of the country",
  "inside the world,worldwide web,google chrome app",
  "google chrome app,worldwide web,inside the world",
  "inside the world,google chrome app",
  "web worldwide",
  "world health organisation, from country",
  "from country",
  "size of the country, bigger compared to other",
  "country from",
  "world health organisation,country from"
)
df_2 <- data.frame(col_2)


# Hand-written expected result: one row per (col_1, col_2) pair, with the
# share of matching phrases given both as a fraction label and as a percentage.
# Note: this reassigns the global col_1 / col_2 vectors to the paired values.
col_1 <- c(
  "inside the world,worldwide web,google chrome app",
  "inside the world,worldwide web,google chrome app",
  "inside the world,worldwide web,google chrome app",
  "inside the world,worldwide web,google chrome app",
  "world health organisation, from country",
  "world health organisation, from country",
  "world health organisation, from country",
  "world health organisation, from country",
  "team work",
  "size of the country, bigger compared to other",
  "size of the country, bigger compared to other",
  "inside the world,worldwide web,google chrome app",
  "world health organisation, from country"
)

col_2 <- c(
  "inside the world,worldwide web,google chrome app",
  "inside the world,google chrome app",
  "google chrome app,worldwide web,inside the world",
  "web worldwide",
  "world health organisation, from country",
  "from country",
  "country from",
  "world health organisation,country from",
  "team work,in the company",
  "size of the country, bigger compared to other",
  "size of the country",
  "team work,in the company",
  "web worldwide"
)

match_percentage <- c(
  "1/1", "2/3", "3/3", "1/3",
  "2/2", "1/2", "1/1", "2/2",
  "1/1", "2/2", "1/2", "0/3", "0/2"
)

match_numeric_percentage <- c(
  100, 66.666, 100, 33.3333,
  100, 50, 100, 100,
  100, 100, 50, 0, 0
)

df_out <- data.frame(
  col_1,
  col_2,
  match_percentage,
  match_numeric_percentage
)

Explanation of df_out dataframe
enter image description here

san1
  • 455
  • 2
  • 11
  • "Explanation of df_out dataframe" is expected output? if not could you provide expected output? – kashiff007 Oct 01 '21 at 09:06
  • Thanks, expected output data frame is "df_out" and how to achieve the output is explained in the table – san1 Oct 01 '21 at 09:11
  • In df_out, match_percentage tells how many phrases of df_1$col_1 match phrases in df_2$col_2. For example, in row_1 the comma-separated phrases match completely (3/3), and in row_4 only one of the 3 phrases matches (1/3). – san1 Oct 01 '21 at 09:14

1 Answer

0

Try this:

# Count, for every (col_1, col_2) pair, how many comma-separated phrases of
# col_1 also occur in col_2, and express that as a percentage of the number
# of phrases in col_1.
#
# Compared with a plain strsplit(x, ","):
#   * whitespace around each phrase is trimmed, so " from country" (after
#     ", ") matches "from country";
#   * words inside a phrase are compared regardless of order, so
#     "web worldwide" matches "worldwide web" (the behaviour the question's
#     expected df_out asks for).
# Results for rows with ", "-separated or reordered phrases therefore differ
# from an untrimmed, order-sensitive comparison.
#
# Base R only; no dplyr/magrittr required.

# Turn one comma-separated string into a vector of normalised phrase keys:
# split on ",", trim, drop empty phrases, and sort the words of each phrase.
normalise_phrases <- function(x) {
  phrases <- trimws(strsplit(x, ",", fixed = TRUE)[[1]])
  phrases <- phrases[nzchar(phrases)]
  vapply(
    strsplit(phrases, "[[:space:]]+"),
    function(words) paste(sort(words), collapse = " "),
    character(1)
  )
}

df_temp <- data.frame(col_1, col_2)

# as.character() guards against factor columns (data.frame default before
# R 4.0), which strsplit() rejects.
keys_1 <- lapply(as.character(df_temp$col_1), normalise_phrases)
keys_2 <- lapply(as.character(df_temp$col_2), normalise_phrases)

# Number of distinct col_1 phrases that also appear in the paired col_2 entry.
n_matched <- vapply(
  seq_along(keys_1),
  function(i) length(intersect(keys_1[[i]], keys_2[[i]])),
  integer(1)
)

# Number of phrases in each col_1 entry (the denominator).
n_phrases <- lengths(keys_1)

df_out <- df_temp
df_out$perc <- n_matched / n_phrases * 100 # NaN if col_1 has no phrases
df_out$intersect <- n_matched
df_out$n_col_1 <- n_phrases

df_out:

                                              col_1                                            col_2      perc intersect n_col_1
1  inside the world,worldwide web,google chrome app inside the world,worldwide web,google chrome app 100.00000         3       3
2  inside the world,worldwide web,google chrome app               inside the world,google chrome app  66.66667         2       3
3  inside the world,worldwide web,google chrome app google chrome app,worldwide web,inside the world 100.00000         3       3
4  inside the world,worldwide web,google chrome app                                    worldwide web  33.33333         1       3
5           world health organisation, from country          world health organisation, from country 100.00000         2       2
6           world health organisation, from country                                     from country   0.00000         0       2
7           world health organisation, from country                                     from country   0.00000         0       2
8           world health organisation, from country           world health organisation,from country  50.00000         1       2
9                                         team work                         team work,in the company 100.00000         1       1
10    size of the country, bigger compared to other    size of the country, bigger compared to other 100.00000         2       2
11    size of the country, bigger compared to other                              size of the country  50.00000         1       2
12 inside the world,worldwide web,google chrome app                         team work,in the company   0.00000         0       3
13          world health organisation, from country                                    worldwide web   0.00000         0       2
kashiff007
  • 376
  • 2
  • 12