26

Here is dataset:

   set.seed(123)
    myd <- data.frame (class = rep(1:4, each = 100), yvar = rnorm(400, 50,30))
    require(ggplot2)
    m <- ggplot(myd, aes(x = yvar))
    p <- m + geom_histogram(colour = "grey40", fill = "grey40", binwidth = 10)  +
       facet_wrap(~class) + theme_bw( ) 
    p + opts(panel.margin=unit(0 ,"lines"))

I want to add labels to bars which each subject class fall into and produce something like the post-powerpoint processed graph. Is there way to do this within R ? ......

Edit: we can think of different pointer such as dot or error bar, if arrow is not impossible

enter image description here

Let's say the following is subjects to be labelled:

class   name       yvar
2       subject4    104.0
3       subject3    8.5
3       subject1    80.0
4       subject2    40.0
4       subject1    115.0

classd <- data.frame (class = c(2,3,3,4,4), 
 name = c  ("subject4", "subject3", "subject1", "subject2", "subject1"), 
 yvar = c(104.0, 8.5,80.0,40.0, 115.0))
jon
  • 11,186
  • 19
  • 80
  • 132
  • You can certainly add arrows like those from within ggplot, but you'll have to be more specific. Can you provide some sample data on the subjects that would allow you to determine which bar they'd fall into? – joran Jun 20 '12 at 16:42
  • @joran Ok I added sample data thanks – jon Jun 20 '12 at 17:09

2 Answers2

19

Here is partial solution using geom_text() to add labels and geom_segment() with the arrow option to add arrows.

The drawback is that I had to manually choose y positions for each arrow and label. Maybe someone else can help figure out how programmatically find the histogram bar heights.

set.seed(123)
myd <- data.frame (class = rep(1:4, each = 100), yvar = rnorm(400, 50,30))

library(ggplot2)
library(grid) # unit() is in the grid package.

arrow_pos = read.table(header=TRUE, stringsAsFactors=FALSE,
                       text="class   name       yvar
                             2       subject4    104.0
                             3       subject3    8.5
                             3       subject1    80.0
                             4       subject2    40.0
                             4       subject1    115.0")

arrow_pos$y = c(3, 5, 9, 13, 1) # Manually enter y position.
arrow_pos$class = factor(as.character(arrow_pos$class),
    levels=c("1", "2", "3", "4")) # Gets rid of warnings.

p1 = ggplot(myd, aes(x=yvar)) +
     theme_bw() +
     geom_histogram(colour="grey40", fill="grey40", binwidth=10) +
     facet_wrap(~ class) +
     opts(panel.margin=unit(0 ,"lines")) +
     geom_text(data=arrow_pos, aes(label=name, x=yvar, y=y + 2), size=3) +
     geom_segment(data=arrow_pos, 
                  aes(x=yvar, xend=yvar, y=y + 1.5, yend=y + 0.25),
                  arrow=arrow(length=unit(2, "mm")))

png("p1.png", height=600, width=600)
print(p1)
dev.off()

enter image description here

bdemarest
  • 14,397
  • 3
  • 53
  • 56
  • 2
    thank you, I wish I would able to accept your answer as you are first to answer and initiate the idea, however for quick quiding which answer is perfect answer (such as height) I am choosing the second answer...but deserve appreciation token – jon Jun 22 '12 at 13:55
  • 4
    I'm happy to be able to help! I agree that @Sandy Muspratt deserves the credit for the full solution. – bdemarest Jun 22 '12 at 16:35
16

Update opts is deprecated; use theme instead.

Extending bdemarest's response a little, I think this calculates the bar heights programatically. The last two columns of arrow_pos contain the relevant information: Freq is the height of the bar; xval in the x position of the midpoint of the bar. But still, some of the labels overlap the bars.

EDIT By default cut bounds its intervals as (b1, b2], whereas it appeas that ggplot2 bounds its intervals in geom_histogram as [b1, b2). I've modified the code so that both bound their intervals as [b1, b2), ie the ggplot way.

library(ggplot2)
library(grid) # unit() is in the grid package.
library(plyr)  # Data restructuring

set.seed(123)
myd <- data.frame (class = rep(1:4, each = 100), yvar = rnorm(400, 50, 30))

arrow_pos = read.table(header=TRUE, stringsAsFactors=FALSE,
                       text="class   name       yvar
                             2       subject4    104.0
                             3       subject3    8.5
                             3       subject1    80.0
                             4       subject2    40.0
                             4       subject1    115.0")

# Calculate the y positions for the labels and arrows
# For the myd data frame, obtain counts within each bin, but separately for each class
bwidth <- 10   # Set binwidth
Min <- floor(min(myd$yvar)/bwidth) * bwidth
Max <- ceiling(max(myd$yvar)/bwidth) * bwidth

# Function to do the counting
func <- function(df) {
   tab = as.data.frame(table(cut(df$yvar, breaks = seq(Min, Max, bwidth), right = FALSE)))
   tab$upper = Min + bwidth * (as.numeric(rownames(tab)))
   return(tab)
   }

# Apply the function to each class in myd data frame
TableOfCounts <- ddply(myd, .(class), function(df) func(df))

# Transfer counts of arrow_pos
arrow_pos$upper <- (floor(arrow_pos$yvar/bwidth) * bwidth) + bwidth
arrow_pos <- merge(arrow_pos, TableOfCounts, by = c("class", "upper"))
arrow_pos$xvar <- (arrow_pos$upper - .5 * bwidth)      # x position of the arrow is at the midpoint of the bin
arrow_pos$class=factor(as.character(arrow_pos$class),
    levels=c("1", "2", "3", "4")) # Gets rid of warnings.

ggplot(myd, aes(x=yvar)) +
     theme_bw() +
     geom_histogram(colour="grey70", fill="grey70", binwidth=bwidth) +
     facet_wrap(~ class) +
     theme(panel.margin=unit(0, "lines")) +
     geom_text(data=arrow_pos, aes(label=name, x=xvar, y=Freq + 2), size=4) +
     geom_segment(data=arrow_pos, 
                  aes(x=xvar, xend=xvar, y=Freq + 1.5, yend=Freq + 0.25),
                  arrow=arrow(length=unit(2, "mm")))

enter image description here

Sandy Muspratt
  • 31,719
  • 12
  • 116
  • 122