My post is concerning result that I Obtained and seems to be incoherent. My datas is :
DF=structure(list(Age = c(16L, 29L, 22L, 64L, 42L, 46L, 30L, 37L,
31L, 52L, 44L, 54L, 23L, 22L, 42L, 39L, 39L, 51L, 25L, 64L, 55L,
56L, 27L, 31L, 39L, 22L, 54L, 33L, 34L, 18L, 39L, 41L, 52L, 41L,
27L, 36L, 64L, 42L, 21L, 44L, 50L, 35L, 22L, 65L, 53L, 18L, 25L,
59L, 56L, 52L, 39L, 40L, 25L, 63L, 43L, 23L, 52L, 48L, 24L, 45L,
27L, 42L, 56L, 43L, 28L, 51L, 54L, 16L, 65L, 56L, 47L, 45L, 29L,
41L, 52L, 50L, 26L, 44L, 35L, 55L, 57L, 43L, 52L, 28L, 33L, 20L,
39L, 15L, 55L, 20L, 30L, 10L, 54L, 51L, 47L, 36L, 42L, 33L, 26L,
29L, 19L, 22L, 22L, 22L, 40L, 33L, 20L, 43L, 53L, 25L, 25L, 49L,
25L, 31L, 45L, 51L, 60L, 54L, 20L, 25L, 60L, 48L, 35L, 42L, 14L,
28L, 55L, 20L, 35L, 17L, 46L, 20L, 45L, 37L, 33L, 36L, 60L, 47L,
27L, 25L, 51L, 32L, 19L, 25L, 19L, 60L, 18L, 17L, 33L, 26L, 33L,
32L, 33L, 22L, 17L, 24L, 43L, 38L, 27L, 40L, 42L, 41L, 31L, 43L,
34L, 33L, 42L, 37L, 24L, 50L, 53L, 35L, 50L, 37L, 46L, 39L, 33L,
56L, 58L, 23L, 31L, 52L, 50L, 33L, 56L, 55L, 20L, 22L, 44L, 50L,
30L, 58L, 59L, 16L, 33L, 53L, 50L, 20L, 31L, 22L, 38L, 59L, 38L,
62L, 52L, 30L, 18L, 53L, 38L, 41L, 44L, 53L, 19L, 53L, 57L),
Sous_Categorie = c("7", "7", "7", "7", "7", "7", "7", "7",
"7", "7", "7", "7", "7", "7", "7", "7", "7", "7", "7", "7",
"7", "7", "7", "7", "7", "7", "7", "7", "7", "7", "7", "7",
"7", "7", "7", "7,9", "8", "8", "8", "8", "8", "9", "9",
"11", "10,7", "10,8,9", "7", "7", "7", "7", "7", "7,8", "8",
"7", "7", "7", "7", "7", "7", "7", "7", "7", "7", "7", "8",
"11", "7", "12", "12", "12", "12", "12", "12", "12", "12",
"12", "12", "12", "12", "12", "12", "12", "12", "13", "13",
"13", "13", "13", "14", "14", "14", "14", "14", "14", "14",
"14", "14", "14", "14", "14", "14", "14", "14", "14", "14",
"14", "14", "14", "15", "15", "15", "15", "15", "17", "17",
"17", "17", "17", "17", "17", "17", "17", "17", "17", "17",
"17", "17", "17", "17", "17", "17", "17", "17", "17", "17",
"17", "17", "17", "17", "17", "17", "17", "17", "17", "17",
"17", "17", "17", "17", "17", "17", "17", "17", "17", "17",
"17", "17", "17", "17", "17", "17", "17", "17", "17", "17",
"17", "17", "17", "17", "17", "17", "17", "17", "17", "17",
"17", "17", "17", "17", "17", "17", "17", "17", "17", "17",
"17", "17", "17", "17", "17", "18", "18", "18", "7,12", "7,12",
"7,12", "7,12", "7,13,17", "7,16", "7,17", "7,17", "7,17",
"7,17", "7,17", "8,17", "8,17", "11,17", "7,17", "7,17",
"8,17", "7,17", "7,17", "12,14", "12,15", "17,18")), .Names = c("Age",
"Sous_Categorie"), row.names = c(NA, -215L), class = "data.frame")
The code that used with the help of stackoverflow member's (and thanks again)
cats <- unique(unlist(strsplit(DF$Sous_Categorie, ",")))
cat_perc <- function(cats, vec) {
# percentages
nums <- sapply(cats, function(cat) sum(grepl(cat, vec)))
perc <- nums/sum(nums)
final <- perc * length(vec)
df <- as.data.frame(as.list(final))
names(df) <- cats
return(df)
}
a=cat_perc(cats, DF$Sous_Categorie)
a=(a/sum(a))*100
The results that I obtained :
7 9 8 11 10 12 13 14
46.20061 1.215805 4.863222 0.9118541 0.6079027 6.68693 1.823708 6.382979
15 17 18 16
1.823708 27.96353 1.215805 0.3039514
These result are not at all coherent for me, according to the original datas DF, it is clear that I have more 17 than 7. Why I obtain such result ? Is it a coding problem or a statistical issue ?
Thanks a lot