1

I am trying to add labels to the extreme values in a `geom_boxplot` using dplyr, and I am getting an inconsistency with either ggplot or dplyr. What am I doing wrong?

# Toy example: two groups ("1" and "2") of 100 labelled observations each,
# drawn from normal distributions with mean 0 and sd 5 and 3 respectively.
df <- rbind(
  data.frame(
    id = rep("1", 100),
    var = paste0("V", seq_len(100)),
    val = rnorm(100, mean = 0, sd = 5)
  ),
  data.frame(
    id = rep("2", 100),
    var = paste0("V", seq_len(100)),
    val = rnorm(100, mean = 0, sd = 3)
  )
)

# Subset with extreme values: within each id group, keep the rows whose val is
# below the 2.5% quantile or above the 97.5% quantile of that group.
# NOTE: these quantile cut-offs are not the rule geom_boxplot() uses to draw
# its whiskers, so the selected rows need not lie beyond the whiskers.
# `%>%` replaces `%.%`, dplyr's original chaining operator, which was
# deprecated and is no longer available in current dplyr releases.
df_bound <- df %>%
  group_by(id) %>%
  filter(val < quantile(val, 0.025) | val > quantile(val, 0.975)) %>%
  ungroup()  # drop the grouping so later steps see a plain data frame

# Plot: one boxplot per id, with the rows of df_bound overlaid as points and
# labelled with their var value (the label aesthetic is inherited from ggplot()).
ggplot(df, aes(x = id, y = val, fill = id, label = var)) +
  geom_boxplot() +
  geom_point(mapping = aes(group = id), data = df_bound) +
  # hjust shifts each label horizontally away from its point
  geom_text(mapping = aes(group = id), data = df_bound, hjust = -1, size = 4)
yonicd
  • 498
  • 1
  • 4
  • 15
  • 1
    The output looks good to me: what is the inconsistency you are referring to? (If you're referring to the fact that it's drawing points that aren't beyond the whiskers of the plot, that's because you're not calculating those points the same way ggplot2 does) – David Robinson Sep 18 '14 at 10:58
  • You could use `boxplot.stats` instead of `quantile` to identify extreme values. – Roland Sep 18 '14 at 10:59

1 Answers1

2

This is a solution to the problem: identify the extreme values with `boxplot.stats()` instead of `quantile()`:

# Toy data: two id groups of 100 observations each; both groups share the
# labels V1..V100 and differ only in the spread of val (sd 5 vs sd 3).
df <- rbind(
  data.frame(id = rep("1", 100), var = paste0("V", seq_len(100)),
             val = rnorm(100, mean = 0, sd = 5)),
  data.frame(id = rep("2", 100), var = paste0("V", seq_len(100)),
             val = rnorm(100, mean = 0, sd = 3))
)

# New code: within each id group, keep the rows whose val is reported as an
# outlier by boxplot.stats() (its $out component).
# Filtering df directly keeps the var label on each row, so no join back to df
# on the double-valued val column is needed (that join would also duplicate
# rows if a group ever contained tied outlier values).
# `%>%` is used throughout: `%.%` was deprecated by dplyr and is no longer
# available in current releases.
df_bound <- df %>%
  group_by(id) %>%
  filter(val %in% boxplot.stats(val)$out) %>%
  ungroup()  # drop the grouping so later steps see a plain data frame

# Boxplot of val per id; the rows in df_bound are drawn on top as points and
# labelled with var (label aesthetic inherited from the ggplot() call).
ggplot(df, aes(x = id, y = val, fill = id, label = var)) +
  geom_boxplot() +
  geom_point(mapping = aes(group = id), data = df_bound) +
  # hjust shifts each label horizontally away from its point
  geom_text(mapping = aes(group = id), data = df_bound, hjust = -1, size = 4)
yonicd
  • 498
  • 1
  • 4
  • 15