2

I've been trying desperately to create a simple cross tabulation/pivot table of some data I have. It's a huge data.frame with around 11,000 observations and over 100 variables, so for simplicity I've created a subset to use as an example here (see structure in last code chunk).

Each observation represents an individual household from a household survey. The two variable I'm trying to create a crosstab from are 'HouseholdSize' and 'buildingRef'. I want my crosstab to show a count of households that are a certain size against how many households were found within a particular building as represented by 'buildingRef'.

I've been playing with count (from plyr) and dcast (from reshape2) and have got my cross tab where I have 'householdSize' as my rows and 'freq' as my columns (where freq = count of buildingRef) using this:

hhsize_counted <- count(hh_table_short, c("householdSize","buildingRef"))
hhsize_counted <- dcast(hhsize_counted, householdSize~freq)

However I seem to be losing about 25 observations from the original dataframe and I'm not entirely sure why (hh_table_short has 200 obs; hhsize_counted has 175).

Can anyone help me understand why this is? Or crucially could anyone point me in the direction of another way of achieving the same thing!?

I've searched here and all over the web for solutions but to no avail, any help greatly appreciated!

thanks

Marty

    structure(list(buildingRef = structure(c(1L, 2L, 2L, 2L, 3L, 
4L, 5L, 6L, 6L, 6L, 7L, 8L, 9L, 10L, 10L, 10L, 10L, 11L, 12L, 
13L, 14L, 15L, 15L, 16L, 16L, 16L, 17L, 18L, 19L, 19L, 20L, 21L, 
22L, 22L, 23L, 23L, 23L, 23L, 23L, 23L, 24L, 24L, 24L, 25L, 26L, 
26L, 26L, 27L, 28L, 29L, 30L, 31L, 32L, 32L, 33L, 34L, 35L, 36L, 
37L, 38L, 39L, 40L, 41L, 42L, 43L, 43L, 44L, 45L, 46L, 46L, 47L, 
48L, 49L, 50L, 50L, 51L, 52L, 53L, 53L, 53L, 54L, 55L, 56L, 57L, 
58L, 59L, 60L, 60L, 61L, 62L, 62L, 63L, 64L, 64L, 64L, 65L, 66L, 
67L, 68L, 69L, 69L, 69L, 70L, 70L, 70L, 71L, 71L, 72L, 72L, 73L, 
74L, 74L, 75L, 75L, 75L, 75L, 76L, 77L, 78L, 78L, 79L, 79L, 79L, 
79L, 79L, 80L, 80L, 81L, 81L, 82L, 82L, 83L, 83L, 84L, 85L, 85L, 
85L, 85L, 85L, 85L, 86L, 87L, 87L, 88L, 88L, 88L, 88L, 89L, 90L, 
91L, 91L, 91L, 91L, 91L, 92L, 93L, 94L, 95L, 96L, 97L, 98L, 99L, 
99L, 100L, 101L, 102L, 103L, 104L, 105L, 106L, 106L, 106L, 107L, 
108L, 108L, 108L, 109L, 110L, 111L, 111L, 111L, 111L, 111L, 111L, 
112L, 113L, 114L, 114L, 114L, 114L, 114L, 114L, 115L, 116L, 117L, 
118L, 119L, 119L, 119L, 120L), .Label = c("1001001031", "1002001029", 
"1002001060", "1002003013", "1002005026", "1002005060", "1002005088", 
"1002005111", "1002005135", "1002006021", "1002007024", "1004001030", 
"1005001032", "1005002011", "1005003008", "1005003036", "1005005005", 
"1005005030", "1005006068", "1005007012", "1005007043", "1005008019", 
"1005009005", "1005009032", "1005009057", "1005010012", "1005011010", 
"1005013052", "1005015012", "1005016024", "1005017002", "1005017042", 
"1005017077", "1006001008", "1006002010", "1006002039", "1006002063", 
"1006004001", "1006004028", "1006005015", "1006005035", "1006006012", 
"1006007015", "1006007040", "1006008012", "1006008035", "1006009024", 
"1006009052", "1006010015", "1006010044", "1006011032", "1006012001", 
"1006012029", "1006014004", "1006014055", "1006017009", "1007001038", 
"1007003019", "1007003043", "1007004007", "1007004057", "1007005008", 
"1008001030", "1008004006", "1008005024", "1008006014", "1008007019", 
"1009001008", "1009002017", "1009003001", "1009003031", "1009003055", 
"1009003080", "1009004008", "1009004034", "1009004057", "1009005024", 
"1009005053", "1009005077", "1010001005", "1010001046", "1010002011", 
"1010002034", "1010002056", "1010002083", "1010003036", "1010004001", 
"1010005003", "1010006005", "1010007005", "1011001011", "1011002003", 
"1011004012", "1011007002", "1011008016", "1012003036", "1012004008", 
"1012005003", "1012009015", "1013002002", "1013006004", "1013009002", 
"1014001013", "1014003010", "1014004025", "1014006010", "1014008003", 
"1014010011", "1015003013", "1015004018", "1015005009", "1015006015", 
"1015006042", "1015007025", "1015010002", "1015010030", "1015012014", 
"1016001002", "1016002004", "1016003019"), class = "factor"), 
    householdSize = c(5L, 5L, 5L, 3L, 4L, 5L, 4L, 4L, 4L, 7L, 
    5L, 4L, 4L, 2L, 1L, 0L, 4L, 0L, 6L, 7L, 12L, 3L, 2L, 4L, 
    3L, 4L, 4L, 9L, 6L, 4L, 6L, 6L, 3L, 2L, 4L, 3L, 5L, 4L, 3L, 
    2L, 2L, 1L, 1L, 7L, 5L, 7L, 4L, 2L, 6L, 7L, 2L, 5L, 3L, 2L, 
    6L, 12L, 5L, 4L, 9L, 10L, 8L, 7L, 6L, 5L, 2L, 0L, 2L, 4L, 
    5L, 3L, 3L, 2L, 4L, 2L, 1L, 4L, 5L, 10L, 1L, 1L, 4L, 4L, 
    4L, 4L, 7L, 23L, 4L, 6L, 1L, 5L, 4L, 4L, 2L, 1L, 0L, 2L, 
    8L, 9L, 7L, 7L, 7L, 6L, 4L, 4L, 4L, 7L, 3L, 15L, 6L, 6L, 
    3L, 5L, 8L, 5L, 4L, 4L, 11L, 4L, 7L, 1L, 1L, 1L, 3L, 3L, 
    6L, 2L, 1L, 9L, 4L, 15L, 1L, 5L, 1L, 1L, 2L, 10L, 11L, 2L, 
    8L, 15L, 9L, 7L, 2L, 9L, 4L, 4L, 0L, 2L, 6L, 5L, 2L, 2L, 
    2L, 3L, 4L, 5L, 9L, 26L, 6L, 7L, 3L, 3L, 4L, 9L, 1L, 6L, 
    4L, 4L, 4L, 3L, 5L, 3L, 5L, 4L, 2L, 0L, 5L, 7L, 2L, 6L, 2L, 
    0L, 6L, 5L, 8L, 12L, 7L, 5L, 6L, 5L, 5L, 5L, 4L, 2L, 3L, 
    6L, 3L, 3L, 3L, 2L)), .Names = c("buildingRef", "householdSize"
), row.names = c(1L, 2L, 3L, 4L, 5L, 6L, 8L, 9L, 10L, 11L, 12L, 
13L, 14L, 15L, 16L, 17L, 18L, 19L, 20L, 21L, 22L, 23L, 24L, 25L, 
26L, 27L, 28L, 29L, 30L, 31L, 32L, 33L, 34L, 35L, 36L, 37L, 38L, 
39L, 40L, 41L, 42L, 43L, 44L, 45L, 46L, 47L, 48L, 49L, 50L, 51L, 
52L, 53L, 54L, 55L, 56L, 57L, 58L, 59L, 60L, 61L, 62L, 63L, 64L, 
65L, 66L, 67L, 68L, 69L, 70L, 71L, 72L, 73L, 74L, 75L, 76L, 77L, 
78L, 79L, 80L, 81L, 82L, 83L, 84L, 85L, 87L, 88L, 89L, 90L, 91L, 
92L, 93L, 94L, 95L, 96L, 97L, 98L, 99L, 100L, 101L, 102L, 103L, 
104L, 105L, 106L, 107L, 108L, 109L, 110L, 111L, 112L, 113L, 114L, 
115L, 116L, 117L, 118L, 119L, 120L, 121L, 122L, 123L, 124L, 125L, 
126L, 127L, 128L, 129L, 130L, 131L, 132L, 133L, 134L, 135L, 136L, 
137L, 138L, 139L, 140L, 141L, 142L, 143L, 144L, 145L, 146L, 147L, 
148L, 149L, 150L, 151L, 152L, 153L, 154L, 155L, 156L, 157L, 158L, 
159L, 160L, 161L, 162L, 163L, 165L, 166L, 167L, 168L, 169L, 170L, 
171L, 172L, 173L, 174L, 175L, 176L, 177L, 178L, 179L, 181L, 182L, 
183L, 184L, 185L, 186L, 187L, 188L, 189L, 190L, 191L, 192L, 193L, 
194L, 195L, 196L, 197L, 198L, 199L, 200L, 201L, 202L, 203L, 205L
), class = "data.frame")
marty_c
  • 5,779
  • 5
  • 24
  • 27

1 Answers1

1

You're very close, and your first line to count the rows is good:

hhsize_counted <- count(hh_table_short, c("householdSize","buildingRef"))

but you need to set func.aggregate to sum when you do dcast (rather than the default of length). Otherwise, it's treating your original hhsize_counted dataset like the frequency is always 1.

hhsize_counted <- dcast(hhsize_counted, householdSize~freq, fun.aggregate=sum)
David Robinson
  • 77,383
  • 16
  • 167
  • 187