0

So for my analysis I need to create 4 subsamples in my sample:

I want to compare Voters' turnout percentage among

1) Voters with 0 other voters in the household
2) Voters with 1 other voter in the household
3) Voters with 2 other voters in the household
4) Voters with 3+ other voters in the household

I have 7 variables for that:
1) Size of Household (vn437)
2) Age of 2nd person in HH (vn438a)
3) Age of 3rd person in HH (vn438b)
and so on until 6th person's age in HH

By my logic, I would need to create 4 subsamples, one for each group:

First group (0 other voters in the HH) would be observations that fulfill:

d$vn437 == 1;  
d$vn437 == 2 & d$vn438a < 18;  
d$vn437 == 3 & d$vn438a < 18 & d$vn438b < 18  
d$vn437 == 4 & d$vn438a < 18 & d$vn438b < 18 & d$vn438c < 18; 

and so on until I finish with 'd$vn438e < 18'

I am an ultra noob with R and I have no idea how to go about this.
How would I create these groups? I am really desperate and I've been looking for hours to no avail!

As Richard Telford suggested, here is the output of the `dput(head(d))` command:

# Output of dput(head(d)): the first six rows of the raw survey data frame,
# still carrying the original variable names (vn437 = household size,
# vn438a = age of the 2nd person in the household, vn438b = age of the 3rd).
# The value 1000 in vn438b is the missing-value code, not a real age;
# presumably the same holds for vn438c-vn438e, which are also all 1000 here.
structure(list(dat = c(20091026, 20091025, 20091025, 20091026, 
20091025, 20091025), vn1 = c(1, 2, 1, 1, 1, 1), vn542 = c(27, 
22, 25, 23, 24, 22), vn217 = c(4, 3, 2, 4, 3, 3), n111 = c(1, 
1, 1, 2, 1, 1), vn437 = c(2, 2, 2, 2, 2, 2), vn438a = c(28, 24, 
24, 24, 23, 25), vn438b = c(1000, 1000, 1000, 1000, 1000, 1000
), vn438c = c(1000, 1000, 1000, 1000, 1000, 1000), vn438d = c(1000, 
1000, 1000, 1000, 1000, 1000), vn438e = c(1000, 1000, 1000, 1000, 
1000, 1000), vn5 = c(4, 4, 4, 4, 4, 4), vn9a = c(5, 5, 5, 5, 
5, 5), vn75 = c(1, 1, 3, 2, 1, 3), vn79 = c(2, 2, 2, 2, 2, 2)), .Names = c("dat", 
"vn1", "vn542", "vn217", "n111", "vn437", "vn438a", "vn438b", 
"vn438c", "vn438d", "vn438e", "vn5", "vn9a", "vn75", "vn79"), row.names = c(2174L, 
2175L, 2177L, 2178L, 2180L, 2181L), class = "data.frame")  

The value 1000 (e.g. in vn438b) is the code for missing values (NA). If I removed the rows containing it I would lose other observations, so I did not clean the age variables for the Nth person in the household.

Also here is what I want my outcome to look like in the end

EDIT

Managed to solve it on my own. For anyone interested, here's my code:

# Replace the survey's coded column names with readable ones.
# Columns 2 through 15 are renamed by position in a single assignment;
# column 1 keeps its original name.
colnames(d)[2:15] <- c(
  "sex", "age", "polint", "turnout", "HHsize",
  "HHage2", "HHage3", "HHage4", "HHage5", "HHage6",
  "marital", "education", "income", "religion"
)


####################################################################
## creating subsets: no other voters in HH --> combine them later ##
####################################################################
# A respondent has no other voter in the household when they live alone or
# when every other household member is a minor (age < 18). The missing-value
# code 1000 is never < 18, so a member with unknown age is not treated as a
# minor. Every household size from 1 to 6 is covered; sizes with no matching
# rows simply contribute nothing to the rbind().
zeroHHM <- local({
  minor2 <- d$HHage2 < 18
  minor3 <- d$HHage3 < 18
  minor4 <- d$HHage4 < 18
  minor5 <- d$HHage5 < 18
  minor6 <- d$HHage6 < 18

  # one subset per household size, stacked in order of increasing size
  rbind(
    d[d$HHsize == 1, ],
    d[d$HHsize == 2 & minor2, ],
    d[d$HHsize == 3 & minor2 & minor3, ],
    d[d$HHsize == 4 & minor2 & minor3 & minor4, ],
    d[d$HHsize == 5 & minor2 & minor3 & minor4 & minor5, ],
    d[d$HHsize == 6 & minor2 & minor3 & minor4 & minor5 & minor6, ]
  )
})

# splitting the group by the respondent's own age:
# youth voters (age < 26) and non-youth voters (age > 25)
Youth0 <- zeroHHM[zeroHHM$age < 26, ]
Old0 <- zeroHHM[zeroHHM$age > 25, ]


##################################################
## repeat whole process for 1 other voter in HH ##
##################################################
# Exactly one other voter: the 2nd person is an adult and every further
# household member is a minor (age < 18).
# NOTE(review): this only matches households where the adult is recorded as the
# 2nd person and the minors after them -- confirm that the survey lists
# household members in that order.
oneHHM <- local({
  # "> 17" marks an adult; "< 900" keeps the missing-value code 1000 from
  # being counted as an adult. The guard applies to every household size,
  # including the 2-person case.
  adult2 <- d$HHage2 > 17 & d$HHage2 < 900
  minor3 <- d$HHage3 < 18
  minor4 <- d$HHage4 < 18
  minor5 <- d$HHage5 < 18
  minor6 <- d$HHage6 < 18

  # one subset per household size, stacked in order of increasing size
  rbind(
    d[d$HHsize == 2 & adult2, ],
    d[d$HHsize == 3 & adult2 & minor3, ],
    d[d$HHsize == 4 & adult2 & minor3 & minor4, ],
    d[d$HHsize == 5 & adult2 & minor3 & minor4 & minor5, ],
    d[d$HHsize == 6 & adult2 & minor3 & minor4 & minor5 & minor6, ]
  )
})

# splitting the group by the respondent's own age:
# youth voters (age < 26) and non-youth voters (age > 25).
# The mask is built from oneHHM's own age column so it lines up with its rows.
Youth1 <- oneHHM[oneHHM$age < 26, ]
Old1 <- oneHHM[oneHHM$age > 25, ]


###################################################
## repeat whole process for 2 other voters in HH ##
###################################################
# Exactly two other voters: the 2nd and 3rd persons are adults and every
# further household member is a minor (age < 18).
# NOTE(review): this only matches households where the adults are recorded as
# the 2nd and 3rd persons and the minors after them -- confirm that the survey
# lists household members in that order.
twoHHM <- local({
  # "> 17" marks an adult; "< 900" keeps the missing-value code 1000 from
  # being counted as an adult
  adult2 <- d$HHage2 > 17 & d$HHage2 < 900
  adult3 <- d$HHage3 > 17 & d$HHage3 < 900
  minor4 <- d$HHage4 < 18
  minor5 <- d$HHage5 < 18
  minor6 <- d$HHage6 < 18

  # one subset per household size, stacked in order of increasing size
  rbind(
    d[d$HHsize == 3 & adult2 & adult3, ],
    d[d$HHsize == 4 & adult2 & adult3 & minor4, ],
    d[d$HHsize == 5 & adult2 & adult3 & minor4 & minor5, ],
    d[d$HHsize == 6 & adult2 & adult3 & minor4 & minor5 & minor6, ]
  )
})

# splitting the group by the respondent's own age:
# youth voters (age < 26) and non-youth voters (age > 25).
# The mask is built from twoHHM's own age column so it lines up with its rows.
Youth2 <- twoHHM[twoHHM$age < 26, ]
Old2 <- twoHHM[twoHHM$age > 25, ]


####################################################
## repeat whole process for 3+ other voters in HH ##
####################################################
# Three or more other voters: the 2nd, 3rd and 4th persons are all adults.
# The ages of a 5th or 6th person are not checked, because the group is
# open-ended ("3+").
threeHHM <- local({
  # "> 17" marks an adult; "< 900" keeps the missing-value code 1000 from
  # being counted as an adult
  three_adults <- d$HHage2 > 17 & d$HHage2 < 900 &
    d$HHage3 > 17 & d$HHage3 < 900 &
    d$HHage4 > 17 & d$HHage4 < 900

  # one subset per household size, stacked in order of increasing size
  rbind(
    d[d$HHsize == 4 & three_adults, ],
    d[d$HHsize == 5 & three_adults, ],
    d[d$HHsize == 6 & three_adults, ]
  )
})

# splitting the group by the respondent's own age:
# youth voters (age < 26) and non-youth voters (age > 25).
# The mask is built from threeHHM's own age column so it lines up with its rows.
Youth3 <- threeHHM[threeHHM$age < 26, ]
Old3 <- threeHHM[threeHHM$age > 25, ]


# Give the four household groups their final short names (HHM0 .. HHM3),
# dropping each long-named object as soon as its copy exists.
HHM0 <- zeroHHM
rm(zeroHHM)

HHM1 <- oneHHM
rm(oneHHM)

HHM2 <- twoHHM
rm(twoHHM)

HHM3 <- threeHHM
rm(threeHHM)
Can Kay
  • 1
  • 1
  • please include some data in your question as it will help others to answer. Use `dput(head(d))` and paste the result. Also please give the intended outcome of your analysis - it will help make your question clearer. – Richard Telford Apr 22 '17 at 19:09
  • I did it as an edit, thank you for the suggestion – Can Kay Apr 22 '17 at 19:17
  • I've been racking my brain over this issue on my own, since I didn't get any replies on any platform, and I figured out that it must be a subset operation with multiple if statements. Something along the lines of: `noHHM <- d[vn437==1 & vn437==2 if vn438a<18 & vn437==3 if vn438a<18 && vn438b<18]` and so on. But I do not know the correct syntax and keep getting errors for unexpected 'if'. I would be very grateful if someone could nudge me in the right direction. – Can Kay Apr 23 '17 at 13:48

0 Answers0