0

Update: Gone manual, cut and paste into multiple sheets. Would be great to find a workaround all the same.

Question: Given the dummy data set below:

structure(list(V1 = structure(c(8L, 6L, 2L, 4L, 1L, 1L, 1L, 1L, 
9L, 5L, 2L, 1L, 1L, 1L, 1L, 10L, 7L, 3L), .Label = c("", "1", 
"12", "5", "Age", "Class A", "Height", "Number of Boys", "More Boys", 
"More Girls"), class = "factor"), V2 = structure(c(1L, 5L, 3L, 
4L, 1L, 1L, 1L, 1L, 1L, 6L, 3L, 1L, 1L, 1L, 1L, 1L, 7L, 2L), .Label = c("", 
"12", "2", "6", "Class B", "Time", "Weight"), class = "factor"), 
    V3 = structure(c(1L, 5L, 3L, 4L, 1L, 1L, 1L, 1L, 1L, 6L, 
    3L, 1L, 1L, 1L, 1L, 1L, 7L, 2L), .Label = c("", "13", "3", 
    "7", "Class C", "Next", "Time"), class = "factor"), V4 = structure(c(1L, 
    5L, 3L, 4L, 1L, 1L, 1L, 1L, 1L, 6L, 3L, 1L, 1L, 1L, 1L, 1L, 
    6L, 2L), .Label = c("", "14", "4", "8", "Class D", "Day"), class = "factor"), 
    V5 = c(NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, 
    NA, NA, NA, NA, NA), V6 = c(NA, NA, NA, NA, NA, NA, NA, NA, 
    NA, NA, NA, NA, NA, NA, NA, NA, NA, NA), V7 = c(NA, NA, NA, 
    NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA
    ), V8 = c(NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, 
    NA, NA, NA, NA, NA, NA), V9 = c(NA, NA, NA, NA, NA, NA, NA, 
    NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA), V10 = structure(c(5L, 
    4L, 3L, 2L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 
    1L, 1L), .Label = c("", "1", "8", "Class E", "Number of Girls"
    ), class = "factor"), V11 = structure(c(1L, 4L, 3L, 2L, 1L, 
    1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L), .Label = c("", 
    "2", "8", "Class F"), class = "factor"), V12 = structure(c(1L, 
    4L, 3L, 2L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 
    1L, 1L), .Label = c("", "3", "9", "Class G"), class = "factor"), 
    V13 = structure(c(1L, 4L, 2L, 3L, 1L, 1L, 1L, 1L, 1L, 1L, 
    1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L), .Label = c("", "0", "4", 
    "Class Q"), class = "factor"), V14 = c(NA, NA, NA, NA, NA, 
    NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA)), class = "data.frame", row.names = c(NA, 
-18L))

This visually looks like(truncated):

               V1      V2      V3      V4 V5 V6 V7 V8 V9             V10     V11     V12     V13 V14
1  Number of Boys                         NA NA NA NA NA Number of Girls                          NA
2            Class A Class B Class C Class D NA NA NA NA NA         Class E Class F Class G Class Q  NA
3                  1       2       3       4 NA NA NA NA NA               8       8       9       0  NA
4                  5       6       7       8 NA NA NA NA NA               1       2       3       4  NA
5                                            NA NA NA NA NA                                          NA
6                                            NA NA NA NA NA                                          NA
7                                            NA NA NA NA NA                                          NA
8                                            NA NA NA NA NA                                          NA
9          More Boys                         NA NA NA NA NA                                          NA
10               Age    Time    Next     Day NA NA NA NA NA                                          NA
11                 1       2       3       4 NA NA NA NA NA                                          NA
12                                           NA NA NA NA NA                                          NA

As can hopefully be seen, these are separate "files" on the same worksheet. I've looked for a quick way to scoop out the different data sets but haven't(yet). Could there be a work around? My idea was to use a sequence based selection, say select every 20 rows but this will obviously fail if you have millions of rows?

Expected Ouptut(with their respective rows)

Three data sets: 
A: Number of Boys
B: Number of Girls
C: More Boys

Thanks in advance.

NelsonGon
  • 13,015
  • 7
  • 27
  • 57
  • 1
    Are the different tables in the original excel-file defined as separate tables or named regions? If so, you can address these separately using e.g. `read.xlsx` package. – Simon Apr 10 '19 at 17:02
  • They're in the same sheet. Different names, separated by "gaps". – NelsonGon Apr 10 '19 at 17:03

2 Answers2

1

Using @alexis_laz's solution at https://stackoverflow.com/a/42120347/6197649

library(Matrix)

x <- !is.na(df) & df != ""
m <- Matrix(x)

#> 18 x 14 sparse Matrix of class "lgCMatrix"
#>    [[ suppressing 14 column names 'V1', 'V2', 'V3' ... ]]
#>                                  
#>  [1,] | . . . . . . . . | . . . .
#>  [2,] | | | | . . . . . | | | | .
#>  [3,] | | | | . . . . . | | | | .
#>  [4,] | | | | . . . . . | | | | .
#>  [5,] . . . . . . . . . . . . . .
#>  [6,] . . . . . . . . . . . . . .
#>  [7,] . . . . . . . . . . . . . .
#>  [8,] . . . . . . . . . . . . . .
#>  [9,] | . . . . . . . . . . . . .
#> [10,] | | | | . . . . . . . . . .
#> [11,] | | | | . . . . . . . . . .
#> [12,] . . . . . . . . . . . . . .
#> [13,] . . . . . . . . . . . . . .
#> [14,] . . . . . . . . . . . . . .
#> [15,] . . . . . . . . . . . . . .
#> [16,] | . . . . . . . . . . . . .
#> [17,] | | | | . . . . . . . . . .
#> [18,] | | | | . . . . . . . . . .

sm = as.matrix(summary(m))

d = dist(sm, "manhattan")

gr = cutree(hclust(d, "single"), h = 1)

res <- sparseMatrix(i = sm[, "i"], j = sm[, "j"], x = gr)

#> 18 x 13 sparse Matrix of class "dgCMatrix"
#>                                
#>  [1,] 1 . . . . . . . . 4 . . .
#>  [2,] 1 1 1 1 . . . . . 4 4 4 4
#>  [3,] 1 1 1 1 . . . . . 4 4 4 4
#>  [4,] 1 1 1 1 . . . . . 4 4 4 4
#>  [5,] . . . . . . . . . . . . .
#>  [6,] . . . . . . . . . . . . .
#>  [7,] . . . . . . . . . . . . .
#>  [8,] . . . . . . . . . . . . .
#>  [9,] 2 . . . . . . . . . . . .
#> [10,] 2 2 2 2 . . . . . . . . .
#> [11,] 2 2 2 2 . . . . . . . . .
#> [12,] . . . . . . . . . . . . .
#> [13,] . . . . . . . . . . . . .
#> [14,] . . . . . . . . . . . . .
#> [15,] . . . . . . . . . . . . .
#> [16,] 3 . . . . . . . . . . . .
#> [17,] 3 3 3 3 . . . . . . . . .
#> [18,] 3 3 3 3 . . . . . . . . .

res2 <- summary(res)

lapply(
  split(res2[, c("i", "j")], res2$x),
  function(area) {
    df[min(area$i):max(area$i), min(area$j):max(area$j), drop = FALSE]
  }
)

#> $`1`
#>                  V1      V2      V3      V4
#> 1 Number of Boys                        
#> 2           Class A Class B Class C Class D
#> 3                 1       2       3       4
#> 4                 5       6       7       8
#> 
#> $`2`
#>           V1   V2   V3  V4
#> 9  More Boys              
#> 10       Age Time Next Day
#> 11         1    2    3   4
#> 
#> $`3`
#>            V1     V2   V3  V4
#> 16 More Girls                
#> 17     Height Weight Time Day
#> 18         12     12   13  14
#> 
#> $`4`
#>               V10     V11     V12     V13
#> 1 Number of Girls                        
#> 2         Class E Class F Class G Class Q
#> 3               8       8       9       0
#> 4               1       2       3       4

Created on 2019-04-10 by the reprex package (v0.2.1)

Aurèle
  • 12,545
  • 1
  • 31
  • 49
1

The following code creates a list of data.frames with column names beeing set correctly. It relies, however, on the fact that in your sheet "columns of tables" are separated by at least one empty column...

df <- apply(df, 2, function(x) gsub("^$|^ $", NA, x))
empty_cols <- sapply(1:ncol(df), function(i){length(which(is.na(df[, i])))==nrow(df)})
start_cols <- c(1, which(diff(empty_cols)==-1)+1)
if (is.na(df[1, 1])) start_cols <- start_cols[-1]
start_rows <- lapply(start_cols, function(i){
  start_rows <- c(1, which(diff(is.na(df[, i]))==-1)+1)
  if (is.na(df[1, i])) start_rows <- start_rows[-1]
  start_rows})

end_rows <- lapply(start_cols, function(i){
  end_rows <- c(1, which(diff(is.na(df[, i]))==1))
  if (!is.na(df[nrow(df), i])) end_rows <- c(end_rows, nrow(df))
  end_rows[-1]})

data.sets <- list()
for (i in 1:length(start_cols)) {
    for (j in 1:length(start_rows[[i]])){

      col <- start_cols[i]
      row <- start_rows[[i]][j]
      start_row <- row+1
      end_row <- end_rows[[i]][j]
      name <- df[row, col]
      ncol <- which(diff(is.na(df[row+1, col:ncol(df)]))==1)[1]
      end_col <- col+ncol-1
      column_names <- df[start_row, col:end_col]
      data <- df[(start_row+1):end_row, col:end_col]
      data <- matrix(data, ncol = length(col:end_col))
      data <- as.data.frame(data)
      names(data) <- column_names
      data.sets[[name]] <- data
    }
}


> data.sets
$`Number of Boys`
  Class A Class B Class C Class D
1       1       2       3       4
2       5       6       7       8

$`More Boys`
  Age Time Next Day
1   1    2    3   4

$`More Girls`
  Height Weight Time Day
1     12     12   13  14

$`Number of Girls`
  Class E Class F Class G Class Q
1       8       8       9       0
2       1       2       3       4
Simon
  • 577
  • 3
  • 9