skip the empty dataframes and produce the output

Question

 sessionInfo()
R version 3.2.0 (2015-04-16)
Platform: x86_64-w64-mingw32/x64 (64-bit)
Running under: Windows 7 x64 (build 7601) Service Pack 1

locale:
[1] LC_COLLATE=German_Germany.1252  LC_CTYPE=German_Germany.1252   
[3] LC_MONETARY=German_Germany.1252 LC_NUMERIC=C                   
[5] LC_TIME=German_Germany.1252    

attached base packages:
[1] grid      stats     graphics  grDevices utils     datasets  methods  
[8] base     

other attached packages:
[1] WriteXLS_3.5.1  tidyr_0.2.0     scales_0.2.4    gridExtra_0.9.1
[5] ggplot2_1.0.1   RPostgreSQL_0.4 DBI_0.3.1      

loaded via a namespace (and not attached):
 [1] Rcpp_0.11.6      assertthat_0.1   dplyr_0.4.1      digest_0.6.8    
 [5] MASS_7.3-40      plyr_1.8.2       gtable_0.1.2     magrittr_1.5    
 [9] stringi_0.4-1    lazyeval_0.1.10  reshape2_1.4.1   proto_0.3-10    
[13] tools_3.2.0      stringr_1.0.0    munsell_0.4.2    parallel_3.2.0  
[17] colorspace_1.2-6

#

library(RPostgreSQL)
    library(ggplot2)
    library(gridExtra)
    library(scales)
    library(tidyr)
    blue.bold.italic.16.text <- element_text(face = "bold", color = "black", size = 12)

#

Consider four machines running in parllel and producing products. And the each dataframe(l1,l2,l3,l4) below represents no of pieces per hour for each machine (actually I collect data from database using RPostgreSQL and these is sample how it looks)

l1 <- structure(list(hours = structure(c(1434081600, 1434085200, 1434088800, 
1434092400, 1434096000, 1434099600, 1434103200, 1434106800, 1434110400, 
1434114000, 1434117600, 1434121200, 1434124800, 1434128400, 1434132000, 
1434135600, 1434139200, 1434142800, 1434146400, 1434150000, 1434153600, 
1434157200, 1434160800, 1434164400), class = c("POSIXct", "POSIXt"
), tzone = ""), count = c(25, 29, 28, 32, 33, 13, 33, 29, 32, 
33, 27, 34, 25, 30, 13, 24, 26, 33, 40, 34, 26, 30, 22, 30)), .Names = c("hours", 
"count"), row.names = c(NA, 24L), class = "data.frame")

l2 <- structure(list(hours = structure(c(1434081600, 1434085200, 1434088800, 
1434092400, 1434096000, 1434099600, 1434103200, 1434106800, 1434110400, 
1434114000, 1434117600, 1434121200, 1434124800, 1434128400, 1434132000, 
1434135600, 1434139200, 1434142800, 1434146400, 1434150000, 1434153600, 
1434157200, 1434160800, 1434164400), class = c("POSIXct", "POSIXt"
), tzone = ""), count = c(25, 29, 28, 32, 33, 13, 33, 29, 32, 
33, 27, 34, 25, 30, 13, 24, 26, 33, 40, 34, 26, 30, 22, 30)), .Names = c("hours", 
"count"), row.names = c(NA, 24L), class = "data.frame")

l3 <- structure(list(hours = structure(c(1434081600, 1434085200, 1434088800, 
1434092400, 1434096000, 1434099600, 1434103200, 1434106800, 1434110400, 
1434114000, 1434117600, 1434121200, 1434124800, 1434128400, 1434132000, 
1434135600, 1434139200, 1434142800, 1434146400, 1434150000, 1434153600, 
1434157200, 1434160800, 1434164400), class = c("POSIXct", "POSIXt"
), tzone = ""), count = c(25, 29, 28, 32, 33, 13, 33, 29, 32, 
33, 27, 34, 25, 30, 13, 24, 26, 33, 40, 34, 26, 30, 22, 30)), .Names = c("hours", 
"count"), row.names = c(NA, 24L), class = "data.frame")

l4 <- structure(list(hours = structure(c(1434081600, 1434085200, 1434088800, 
1434092400, 1434096000, 1434099600, 1434103200, 1434106800, 1434110400, 
1434114000, 1434117600, 1434121200, 1434124800, 1434128400, 1434132000, 
1434135600, 1434139200, 1434142800, 1434146400, 1434150000, 1434153600, 
1434157200, 1434160800, 1434164400), class = c("POSIXct", "POSIXt"
), tzone = ""), count = c(25, 29, 28, 32, 33, 13, 33, 29, 32, 
33, 27, 34, 25, 30, 13, 24, 26, 33, 40, 34, 26, 30, 22, 30)), .Names = c("hours", 
"count"), row.names = c(NA, 24L), class = "data.frame")

# here is my script for the attached plot(output)

df <- merge(l1,l2, by="hours")
df <- merge(df,l3, by="hours")
df <- merge(df,l4, by="hours")

colnames(df) <- c("hours","L 1","L 2","L 3","L 4")
pd <- gather(df, 'Ls', 'count', 2:5)

q <- ggplot(pd, aes(x = hours, y = count)) + geom_bar(stat = "identity") + theme(legend.position = "none")+
  xlab("Time") + ylab("No.Of Pecies") +
  ggtitle("my sample")+
  scale_y_continuous(breaks=seq(0,45, by = 5))+
  theme(axis.text = blue.bold.italic.16.text) +
  scale_x_datetime(breaks=date_breaks("2 hour"),minor_breaks=date_breaks("2 hour"),labels=date_format("%H")) + 
  theme(axis.text.x=element_text(angle=0))+
  facet_grid(~ Ls)

enter image description here

# when all the 4 machines are working - everything is fine, i will run the above script and i will get the rquired output.

Incase if any machine is not working and i have a dataframe with empty rows..then i will get an error while running my script file.

 @ df <- merge(l1,l2, by="hours")
    df <- merge(df,l3, by="hours")
    df <- merge(df,l4, by="hours")
Error in fix.by(by.y, y) : 'by' must specify a uniquely valid column

and the next error at

pd <- gather(df, 'Ls', 'count', 2:5)

how to avoid the empty dataframes and run the script succesfully to produce the output with whatever the no of machines are operating (either it is 2 or 3 or 4)

How does yr data.frame with "empty rows" look like? Contains zeros, NAs, no rows at all? — vaettchen, Jun 18 '15 at 15:57
By the way, thanks for providing data in a useful format and a good MWE — vaettchen, Jun 19 '15 at 10:11

vaettchen · Accepted Answer · 2015-06-19T04:10:00.253

Judging from the error message, the data.frame that causes the error has neither rows nor columns, it seems to be NULL. So the easiest way would be to check for that situation and if the data.frame is NULL, create a a dummy that can be merge()d and gather()ed.

What I would do (not saying this is the best way) is

# for easier looping, put your data.frames in a list
l <- list( l1, l2, l3, l4 )

# create a dummy that mimics the structure of your data.frames
dummy <- structure( list( hours = structure( c( Sys.time() ), 
                          class = c( "POSIXct", "POSIXt" ), tzone = ""),
                          count = c(0)), .Names = c("hours", "count"),
                          row.names = c(NA, 1L), class = "data.frame")

# check for empty data.frames and replace with dummy (will be NA)
for( i in 1:4 ) if( length( l[[ i ]] ) == 0 ) l[[ i ]] <- dummy

# merge
for( i in 2:4 ) l[[ 1 ]] <- merge( l[[ 1 ]], l[[ i ]], 
                                   by = "hours", all = TRUE )

# remove dummy and go back to your code
df <- l[[ 1 ]][ 1:24, ]
colnames( df ) <- c( "hours","L 1","L 2","L 3","L 4" )

There is room for improvement but at least it should display the results, whether or not a machine is operating:

l2 <- NULL

Machine 2 not operating

I think @vaettchen means accept the answer on stack overflow. To do that, click the checkmark of the answer (it's located under the vote buttons). — Heroka, Jun 19 '15 at 10:08
@chanti - there is a button below the top right corner of my answer that gives you the opportunity to mark the answer as "accepted". That is what I mean. Of course, I'm even more happy to hear that you actually use my ideas... — vaettchen, Jun 19 '15 at 10:10

score 1 · Answer 2 · answered Jun 18 '15 at 22:45

One alternative would be to skip the merging all together and go right to stacking the datasets. You would just need to add the Ls column to each individual dataset first.

l1$Ls = "L 1"
l2$Ls = "L 2"
l3$Ls = "L 3"
l4$Ls = "L 4"

Then you could use, e.g., bind_rows from dplyr to make your long dataset pd.

bind_rows(l1, l2, l3, l4)

Source: local data frame [96 x 3]

                 hours count  Ls
1  2015-06-11 21:00:00    25 L 1
2  2015-06-11 22:00:00    29 L 1
3  2015-06-11 23:00:00    28 L 1
4  2015-06-12 00:00:00    32 L 1
5  2015-06-12 01:00:00    33 L 1
6  2015-06-12 02:00:00    13 L 1
7  2015-06-12 03:00:00    33 L 1
8  2015-06-12 04:00:00    29 L 1
9  2015-06-12 05:00:00    32 L 1
10 2015-06-12 06:00:00    33 L 1
..                 ...   ... ...

The positive of this approach is that one of the objects you bind can be an empty data.frame or NULL and it still works.

Example empty data.frame:

l4.2 = data.frame()

bind_rows(l1, l2, l3, l4.2)

Source: local data frame [72 x 3]

                 hours count  Ls
1  2015-06-11 21:00:00    25 L 1
2  2015-06-11 22:00:00    29 L 1
3  2015-06-11 23:00:00    28 L 1
4  2015-06-12 00:00:00    32 L 1
5  2015-06-12 01:00:00    33 L 1
6  2015-06-12 02:00:00    13 L 1
7  2015-06-12 03:00:00    33 L 1
8  2015-06-12 04:00:00    29 L 1
9  2015-06-12 05:00:00    32 L 1
10 2015-06-12 06:00:00    33 L 1
..                 ...   ... ...

Example NULL:

l4.3 = NULL

bind_rows(l1, l2, l3, l4.3)

Source: local data frame [72 x 3]

                 hours count  Ls
1  2015-06-11 21:00:00    25 L 1
2  2015-06-11 22:00:00    29 L 1
3  2015-06-11 23:00:00    28 L 1
4  2015-06-12 00:00:00    32 L 1
5  2015-06-12 01:00:00    33 L 1
6  2015-06-12 02:00:00    13 L 1
7  2015-06-12 03:00:00    33 L 1
8  2015-06-12 04:00:00    29 L 1
9  2015-06-12 05:00:00    32 L 1
10 2015-06-12 06:00:00    33 L 1
..                 ...   ... ...

skip the empty dataframes and produce the output

2 Answers2