0

I need to understand how to pass string values (NSE) to dplyr's group_by function. My data set and the code below work fine with "group_by" but do not work with the "group_by_" version. I am unable to find my mistake.

ID,Region,Dimension,BlogsInd.,BlogsNews,BlogsTech,Columns
1,PK,Dim1,-4.75,NA,NA,NA
2,PK,Dim1,-5.69,NA,NA,NA
3,PK,Dim1,-0.27,NA,NA,NA
4,PK,Dim1,-2.76,NA,NA,NA
5,PK,Dim1,-8.24,NA,NA,NA
6,PK,Dim1,-12.51,NA,NA,NA
7,PK,Dim1,-1.28,NA,NA,NA
8,PK,Dim1,0.95,NA,NA,NA
9,PK,Dim1,-5.96,NA,NA,NA
10,PK,Dim1,-8.81,NA,NA,NA
11,PK,Dim1,-8.46,NA,NA,NA
12,PK,Dim1,-6.15,NA,NA,NA
13,PK,Dim1,-13.98,NA,NA,NA
14,PK,Dim1,-16.43,NA,NA,NA
15,PK,Dim1,-4.09,NA,NA,NA
16,PK,Dim1,-11.06,NA,NA,NA
17,PK,Dim1,-9.04,NA,NA,NA
18,PK,Dim1,-8.56,NA,NA,NA
19,PK,Dim1,-8.13,NA,NA,NA
20,PK,Dim2,-14.46,NA,NA,NA
21,PK,Dim2,-4.21,NA,NA,NA
22,PK,Dim2,-4.96,NA,NA,NA
23,PK,Dim2,-5.48,NA,NA,NA
24,PK,Dim2,-4.53,NA,NA,NA
25,PK,Dim2,6.31,NA,NA,NA
26,PK,Dim2,-11.16,NA,NA,NA
27,PK,Dim2,-1.27,NA,NA,NA
28,PK,Dim2,-11.49,NA,NA,NA
29,PK,Dim2,-0.9,NA,NA,NA
30,PK,Dim2,-12.27,NA,NA,NA
31,PK,Dim2,6.85,NA,NA,NA
32,PK,Dim2,-5.21,NA,NA,NA
33,PK,Dim2,-1.06,NA,NA,NA
34,PK,Dim2,-2.6,NA,NA,NA
35,PK,Dim2,-0.95,NA,NA,NA
36,PK,Dim3,-0.82,NA,NA,NA
37,PK,Dim3,-7.65,NA,NA,NA
38,PK,Dim3,0.64,NA,NA,NA
39,PK,Dim3,-2.25,NA,NA,NA
40,PK,Dim3,-1.58,NA,NA,NA
41,PK,Dim3,-5.73,NA,NA,NA
42,PK,Dim3,0.37,NA,NA,NA
43,PK,Dim3,-5.46,NA,NA,NA
44,PK,Dim3,-3.48,NA,NA,NA
45,PK,Dim3,0.88,NA,NA,NA
46,PK,Dim3,-2.11,NA,NA,NA
47,PK,Dim3,-10.13,NA,NA,NA
48,PK,Dim3,-2.08,NA,NA,NA
49,PK,Dim3,-4.33,NA,NA,NA
50,PK,Dim3,1.09,NA,NA,NA
51,PK,Dim3,-4.23,NA,NA,NA
52,PK,Dim3,-1.46,NA,NA,NA
53,PK,Dim3,9.37,NA,NA,NA
54,PK,Dim3,5.84,NA,NA,NA
55,PK,Dim3,8.21,NA,NA,NA
56,PK,Dim3,7.34,NA,NA,NA
57,PK,Dim4,1.83,NA,NA,NA
58,PK,Dim4,14.39,NA,NA,NA
59,PK,Dim4,22.02,NA,NA,NA
60,PK,Dim4,4.83,NA,NA,NA
61,PK,Dim4,-3.24,NA,NA,NA
62,PK,Dim4,-5.69,NA,NA,NA
63,PK,Dim4,-22.92,NA,NA,NA
64,PK,Dim4,0.41,NA,NA,NA
65,PK,Dim4,-4.42,NA,NA,NA
66,PK,Dim4,-10.72,NA,NA,NA
67,PK,Dim4,-11.29,NA,NA,NA
68,PK,Dim4,-2.89,NA,NA,NA
69,PK,Dim4,-7.59,NA,NA,NA
70,PK,Dim4,-7.45,NA,NA,NA
71,US,Dim1,-12.49,NA,NA,NA
72,US,Dim1,-11.59,NA,NA,NA
73,US,Dim1,-4.6,NA,NA,NA
74,US,Dim1,-22.83,NA,NA,NA
75,US,Dim1,-4.83,NA,NA,NA
76,US,Dim1,-14.76,NA,NA,NA
77,US,Dim1,-15.93,NA,NA,NA
78,US,Dim1,-2.78,NA,NA,NA
79,US,Dim1,-16.39,NA,NA,NA
80,US,Dim1,-15.22,NA,NA,NA
81,US,Dim1,3.25,NA,NA,NA
82,US,Dim1,-2.73,NA,NA,NA
83,US,Dim1,0.96,NA,NA,NA
84,US,Dim1,-1.12,NA,NA,NA
85,US,Dim1,-0.33,NA,NA,NA
86,US,Dim1,-6.45,NA,NA,NA
87,US,Dim1,2.52,NA,NA,NA
88,US,Dim1,3.18,NA,NA,NA
89,US,Dim1,4.65,NA,NA,NA
90,US,Dim2,-1.75,NA,NA,NA
91,US,Dim2,-0.22,NA,NA,NA
92,US,Dim2,8.16,NA,NA,NA
93,US,Dim2,1.89,NA,NA,NA
94,US,Dim2,4.31,NA,NA,NA
95,US,Dim2,-0.41,NA,NA,NA
96,US,Dim2,-23.02,NA,NA,NA
97,US,Dim2,3.87,NA,NA,NA
98,US,Dim2,-4.76,NA,NA,NA
99,US,Dim2,4.95,NA,NA,NA
100,US,Dim2,4.78,NA,NA,NA
101,US,Dim2,-15.11,NA,NA,NA
102,US,Dim2,-3.74,NA,NA,NA
103,US,Dim2,-6.15,NA,NA,NA
104,US,Dim2,-8.33,NA,NA,NA
105,US,Dim2,-5.55,NA,NA,NA
106,US,Dim3,-5.1,NA,NA,NA
107,US,Dim3,-0.41,NA,NA,NA
108,US,Dim3,-8,NA,NA,NA
109,US,Dim3,-11.8,NA,NA,NA
110,US,Dim3,-10.39,NA,NA,NA
111,US,Dim3,-14.98,NA,NA,NA
112,US,Dim3,-13.14,NA,NA,NA
113,US,Dim3,-16.06,NA,NA,NA
114,US,Dim3,-16.75,NA,NA,NA
115,US,Dim3,-17.58,NA,NA,NA
116,US,Dim3,-13.12,NA,NA,NA
117,US,Dim3,-15.69,NA,NA,NA
118,US,Dim3,-9.29,NA,NA,NA
119,US,Dim3,-14.93,NA,NA,NA
120,US,Dim3,-18.75,NA,NA,NA
121,US,Dim3,-16.15,NA,NA,NA
122,US,Dim3,-14.38,NA,NA,NA
123,US,Dim3,-11.33,NA,NA,NA
124,US,Dim3,2.06,NA,NA,NA
125,US,Dim3,1.55,NA,NA,NA
126,US,Dim3,3.17,NA,NA,NA
127,US,Dim4,3.33,NA,NA,NA
128,US,Dim4,-3.31,NA,NA,NA
129,US,Dim4,5.67,NA,NA,NA
130,US,Dim4,-1.94,NA,NA,NA
131,US,Dim4,-4.2,NA,NA,NA
132,US,Dim4,-13.53,NA,NA,NA
133,US,Dim4,-10.84,NA,NA,NA
134,US,Dim4,-1.04,NA,NA,NA
135,US,Dim4,-8.02,NA,NA,NA
136,US,Dim4,-14.65,NA,NA,NA
137,US,Dim4,-6.39,NA,NA,NA
138,US,Dim4,-3.69,NA,NA,NA
139,US,Dim4,-11.62,NA,NA,NA
140,US,Dim4,-3.02,NA,NA,NA
141,US,Dim4,-28.84,NA,NA,NA

.

attach(dims_Blog)
d1 <- dims_Blog %>% group_by(Dimension, Region) %>% summarise(mean=mean(BlogsInd., na.rm=TRUE))
d1
Dimension Region       mean
     <fctr> <fctr>      <dbl>
1      Dim1     PK -3.7385551
2      Dim1     US -4.2264179
3      Dim2     PK  1.9985551
4      Dim2     US  1.3509577
5      Dim3     PK  0.8965019
6      Dim3     US  1.5335199
7      Dim4     PK  1.4830672
8      Dim4     US  0.3913806

But the same code with the other version does not work. Where am I wrong?

d1 <- dims_Blog %>% group_by_("Dimension", "Region") %>% summarise_(mean="mean(BlogsInd.)", na.rm=TRUE)
> d1
Source: local data frame [8 x 4]
Groups: Dimension [?]

  Dimension Region  mean na.rm
     <fctr> <fctr> <dbl> <lgl>
1      Dim1     PK    NA  TRUE
2      Dim1     US    NA  TRUE
3      Dim2     PK    NA  TRUE
4      Dim2     US    NA  TRUE
5      Dim3     PK    NA  TRUE
6      Dim3     US    NA  TRUE
7      Dim4     PK    NA  TRUE
8      Dim4     US    NA  TRUE
Axeman
  • 32,068
  • 8
  • 81
  • 94
Shakir
  • 343
  • 5
  • 23
  • What it means is that you can pass the names stored in certain objects instead of the quoted version – akrun Feb 14 '17 at 10:03
  • Yes I understand that. I use it in a function (hence i pass string names of relevant objects to group_by_). But as you can see above, it is not working for me. I need to know where is the mistake in the application of "group_by_". – Shakir Feb 14 '17 at 10:21
  • you may need to use `group_by_(.dots = ` – akrun Feb 14 '17 at 10:25
  • Result remains the same. – Shakir Feb 14 '17 at 10:40
  • Don't use `attach`. Ever. – Axeman Feb 14 '17 at 10:51
  • The `group_by` works fine for me, though you probably want `summarise_(mean = ~mean(BlogsInd., na.rm=TRUE))` or `summarise_(mean = "mean(BlogsInd., na.rm=TRUE)")`. – Axeman Feb 14 '17 at 10:54
  • 1
    Isn't the only problem that your `na.rm` argument is outside of the `mean` call in your second example, so it is now created as a separate variable, and your mean returns `NA` because the `NA`s in your ungrouped data are no longer removed? The grouping seems to work fine, since d1 has the same number of rows in both cases. – Marijn Stevering Feb 14 '17 at 10:56
  • `na.rm=TRUE` outside of the quotes does seem to be the problem. I just didn't know how to put it inside, because here it is sent as an argument to mean, i.e. inside the brackets. I tried to put it outside like this: `"mean(BlogsInd.), na.rm=TRUE"` – Shakir Feb 14 '17 at 11:01
  • It should be sent to mean, you send it to mean in the first example. – Marijn Stevering Feb 14 '17 at 11:02
  • I already showed the correct code in my comment above? – Axeman Feb 14 '17 at 11:07
  • And it does not work when the column name is stored in a string object, like this: `x = "Dimension" > y = "BlogsInd." > colour = "Region" > d1 <- dims_Blog %>% group_by_(.dots=x, colour) %>% summarise_(mean = "mean(y, na.rm=TRUE)")` Error: Error in summarise_impl(.data, dots) : object 'y' not found – Shakir Feb 14 '17 at 11:07

1 Answer

1

The problem is not the NSE, but the na.rm argument. In the first example you pass this argument to mean; in the second example it is split off, and summarise interprets it as a new variable to be added. By moving na.rm back into the mean call, I get the same results from both methods:

d1 <- dims_Blog %>% group_by(Dimension, Region) %>% summarise(mean=mean(BlogsInd., na.rm=TRUE))
d2 <- dims_Blog %>% group_by_("Dimension", "Region") %>% summarise_(mean="mean(BlogsInd., na.rm=TRUE)")
identical(d1,d2) #Returns TRUE
Marijn Stevering
  • 1,204
  • 10
  • 24
  • The problem is that I pass the column names as string objects to the function that contains this code. Please see my last comment above, which explains that passing the string as `y="BlogsInd."` and then calling `summarise_(mean="mean(y, na.rm=TRUE)")` gives an "object (y) not found" error. – Shakir Feb 14 '17 at 11:16
  • That's a different problem: you need to build the string with the variable's value inserted into it, so `summarise_(mean=paste("mean(",y,", na.rm=TRUE)"))` should work. – Marijn Stevering Feb 14 '17 at 11:59
  • Thanks a lot. That was the solution I had been looking for for days. R doesn't make it easy. – Shakir Feb 14 '17 at 12:05