I've been trying desperately to create a simple cross tabulation/pivot table of some data I have. It's a huge data.frame with around 11,000 observations and over 100 variables, so for simplicity I've created a subset to use as an example here (see structure in last code chunk).
Each observation represents an individual household from a household survey. The two variable I'm trying to create a crosstab from are 'HouseholdSize' and 'buildingRef'. I want my crosstab to show a count of households that are a certain size against how many households were found within a particular building as represented by 'buildingRef'.
I've been playing with count (from plyr) and dcast (from reshape2) and have got my cross tab where I have 'householdSize' as my rows and 'freq' as my columns (where freq = count of buildingRef) using this:
hhsize_counted <- count(hh_table_short, c("householdSize","buildingRef"))
hhsize_counted <- dcast(hhsize_counted, householdSize~freq)
However I seem to be losing about 25 observations from the original dataframe and I'm not entirely sure why (hh_table_short has 200 obs; hhsize_counted has 175).
Can anyone help me understand why this is? Or crucially could anyone point me in the direction of another way of achieving the same thing!?
I've searched here and all over the web for solutions but to no avail, any help greatly appreciated!
thanks
Marty
structure(list(buildingRef = structure(c(1L, 2L, 2L, 2L, 3L,
4L, 5L, 6L, 6L, 6L, 7L, 8L, 9L, 10L, 10L, 10L, 10L, 11L, 12L,
13L, 14L, 15L, 15L, 16L, 16L, 16L, 17L, 18L, 19L, 19L, 20L, 21L,
22L, 22L, 23L, 23L, 23L, 23L, 23L, 23L, 24L, 24L, 24L, 25L, 26L,
26L, 26L, 27L, 28L, 29L, 30L, 31L, 32L, 32L, 33L, 34L, 35L, 36L,
37L, 38L, 39L, 40L, 41L, 42L, 43L, 43L, 44L, 45L, 46L, 46L, 47L,
48L, 49L, 50L, 50L, 51L, 52L, 53L, 53L, 53L, 54L, 55L, 56L, 57L,
58L, 59L, 60L, 60L, 61L, 62L, 62L, 63L, 64L, 64L, 64L, 65L, 66L,
67L, 68L, 69L, 69L, 69L, 70L, 70L, 70L, 71L, 71L, 72L, 72L, 73L,
74L, 74L, 75L, 75L, 75L, 75L, 76L, 77L, 78L, 78L, 79L, 79L, 79L,
79L, 79L, 80L, 80L, 81L, 81L, 82L, 82L, 83L, 83L, 84L, 85L, 85L,
85L, 85L, 85L, 85L, 86L, 87L, 87L, 88L, 88L, 88L, 88L, 89L, 90L,
91L, 91L, 91L, 91L, 91L, 92L, 93L, 94L, 95L, 96L, 97L, 98L, 99L,
99L, 100L, 101L, 102L, 103L, 104L, 105L, 106L, 106L, 106L, 107L,
108L, 108L, 108L, 109L, 110L, 111L, 111L, 111L, 111L, 111L, 111L,
112L, 113L, 114L, 114L, 114L, 114L, 114L, 114L, 115L, 116L, 117L,
118L, 119L, 119L, 119L, 120L), .Label = c("1001001031", "1002001029",
"1002001060", "1002003013", "1002005026", "1002005060", "1002005088",
"1002005111", "1002005135", "1002006021", "1002007024", "1004001030",
"1005001032", "1005002011", "1005003008", "1005003036", "1005005005",
"1005005030", "1005006068", "1005007012", "1005007043", "1005008019",
"1005009005", "1005009032", "1005009057", "1005010012", "1005011010",
"1005013052", "1005015012", "1005016024", "1005017002", "1005017042",
"1005017077", "1006001008", "1006002010", "1006002039", "1006002063",
"1006004001", "1006004028", "1006005015", "1006005035", "1006006012",
"1006007015", "1006007040", "1006008012", "1006008035", "1006009024",
"1006009052", "1006010015", "1006010044", "1006011032", "1006012001",
"1006012029", "1006014004", "1006014055", "1006017009", "1007001038",
"1007003019", "1007003043", "1007004007", "1007004057", "1007005008",
"1008001030", "1008004006", "1008005024", "1008006014", "1008007019",
"1009001008", "1009002017", "1009003001", "1009003031", "1009003055",
"1009003080", "1009004008", "1009004034", "1009004057", "1009005024",
"1009005053", "1009005077", "1010001005", "1010001046", "1010002011",
"1010002034", "1010002056", "1010002083", "1010003036", "1010004001",
"1010005003", "1010006005", "1010007005", "1011001011", "1011002003",
"1011004012", "1011007002", "1011008016", "1012003036", "1012004008",
"1012005003", "1012009015", "1013002002", "1013006004", "1013009002",
"1014001013", "1014003010", "1014004025", "1014006010", "1014008003",
"1014010011", "1015003013", "1015004018", "1015005009", "1015006015",
"1015006042", "1015007025", "1015010002", "1015010030", "1015012014",
"1016001002", "1016002004", "1016003019"), class = "factor"),
householdSize = c(5L, 5L, 5L, 3L, 4L, 5L, 4L, 4L, 4L, 7L,
5L, 4L, 4L, 2L, 1L, 0L, 4L, 0L, 6L, 7L, 12L, 3L, 2L, 4L,
3L, 4L, 4L, 9L, 6L, 4L, 6L, 6L, 3L, 2L, 4L, 3L, 5L, 4L, 3L,
2L, 2L, 1L, 1L, 7L, 5L, 7L, 4L, 2L, 6L, 7L, 2L, 5L, 3L, 2L,
6L, 12L, 5L, 4L, 9L, 10L, 8L, 7L, 6L, 5L, 2L, 0L, 2L, 4L,
5L, 3L, 3L, 2L, 4L, 2L, 1L, 4L, 5L, 10L, 1L, 1L, 4L, 4L,
4L, 4L, 7L, 23L, 4L, 6L, 1L, 5L, 4L, 4L, 2L, 1L, 0L, 2L,
8L, 9L, 7L, 7L, 7L, 6L, 4L, 4L, 4L, 7L, 3L, 15L, 6L, 6L,
3L, 5L, 8L, 5L, 4L, 4L, 11L, 4L, 7L, 1L, 1L, 1L, 3L, 3L,
6L, 2L, 1L, 9L, 4L, 15L, 1L, 5L, 1L, 1L, 2L, 10L, 11L, 2L,
8L, 15L, 9L, 7L, 2L, 9L, 4L, 4L, 0L, 2L, 6L, 5L, 2L, 2L,
2L, 3L, 4L, 5L, 9L, 26L, 6L, 7L, 3L, 3L, 4L, 9L, 1L, 6L,
4L, 4L, 4L, 3L, 5L, 3L, 5L, 4L, 2L, 0L, 5L, 7L, 2L, 6L, 2L,
0L, 6L, 5L, 8L, 12L, 7L, 5L, 6L, 5L, 5L, 5L, 4L, 2L, 3L,
6L, 3L, 3L, 3L, 2L)), .Names = c("buildingRef", "householdSize"
), row.names = c(1L, 2L, 3L, 4L, 5L, 6L, 8L, 9L, 10L, 11L, 12L,
13L, 14L, 15L, 16L, 17L, 18L, 19L, 20L, 21L, 22L, 23L, 24L, 25L,
26L, 27L, 28L, 29L, 30L, 31L, 32L, 33L, 34L, 35L, 36L, 37L, 38L,
39L, 40L, 41L, 42L, 43L, 44L, 45L, 46L, 47L, 48L, 49L, 50L, 51L,
52L, 53L, 54L, 55L, 56L, 57L, 58L, 59L, 60L, 61L, 62L, 63L, 64L,
65L, 66L, 67L, 68L, 69L, 70L, 71L, 72L, 73L, 74L, 75L, 76L, 77L,
78L, 79L, 80L, 81L, 82L, 83L, 84L, 85L, 87L, 88L, 89L, 90L, 91L,
92L, 93L, 94L, 95L, 96L, 97L, 98L, 99L, 100L, 101L, 102L, 103L,
104L, 105L, 106L, 107L, 108L, 109L, 110L, 111L, 112L, 113L, 114L,
115L, 116L, 117L, 118L, 119L, 120L, 121L, 122L, 123L, 124L, 125L,
126L, 127L, 128L, 129L, 130L, 131L, 132L, 133L, 134L, 135L, 136L,
137L, 138L, 139L, 140L, 141L, 142L, 143L, 144L, 145L, 146L, 147L,
148L, 149L, 150L, 151L, 152L, 153L, 154L, 155L, 156L, 157L, 158L,
159L, 160L, 161L, 162L, 163L, 165L, 166L, 167L, 168L, 169L, 170L,
171L, 172L, 173L, 174L, 175L, 176L, 177L, 178L, 179L, 181L, 182L,
183L, 184L, 185L, 186L, 187L, 188L, 189L, 190L, 191L, 192L, 193L,
194L, 195L, 196L, 197L, 198L, 199L, 200L, 201L, 202L, 203L, 205L
), class = "data.frame")