0

My post concerns a result that I obtained and that seems incoherent. My data is:

# Sample data: a data.frame with 215 rows and two columns.
#   Age            - integer column.
#   Sous_Categorie - character column; each element holds one category code
#                    or several codes separated by commas (e.g. "7,17").
DF=structure(list(Age = c(16L, 29L, 22L, 64L, 42L, 46L, 30L, 37L, 
31L, 52L, 44L, 54L, 23L, 22L, 42L, 39L, 39L, 51L, 25L, 64L, 55L, 
56L, 27L, 31L, 39L, 22L, 54L, 33L, 34L, 18L, 39L, 41L, 52L, 41L, 
27L, 36L, 64L, 42L, 21L, 44L, 50L, 35L, 22L, 65L, 53L, 18L, 25L, 
59L, 56L, 52L, 39L, 40L, 25L, 63L, 43L, 23L, 52L, 48L, 24L, 45L, 
27L, 42L, 56L, 43L, 28L, 51L, 54L, 16L, 65L, 56L, 47L, 45L, 29L, 
41L, 52L, 50L, 26L, 44L, 35L, 55L, 57L, 43L, 52L, 28L, 33L, 20L, 
39L, 15L, 55L, 20L, 30L, 10L, 54L, 51L, 47L, 36L, 42L, 33L, 26L, 
29L, 19L, 22L, 22L, 22L, 40L, 33L, 20L, 43L, 53L, 25L, 25L, 49L, 
25L, 31L, 45L, 51L, 60L, 54L, 20L, 25L, 60L, 48L, 35L, 42L, 14L, 
28L, 55L, 20L, 35L, 17L, 46L, 20L, 45L, 37L, 33L, 36L, 60L, 47L, 
27L, 25L, 51L, 32L, 19L, 25L, 19L, 60L, 18L, 17L, 33L, 26L, 33L, 
32L, 33L, 22L, 17L, 24L, 43L, 38L, 27L, 40L, 42L, 41L, 31L, 43L, 
34L, 33L, 42L, 37L, 24L, 50L, 53L, 35L, 50L, 37L, 46L, 39L, 33L, 
56L, 58L, 23L, 31L, 52L, 50L, 33L, 56L, 55L, 20L, 22L, 44L, 50L, 
30L, 58L, 59L, 16L, 33L, 53L, 50L, 20L, 31L, 22L, 38L, 59L, 38L, 
62L, 52L, 30L, 18L, 53L, 38L, 41L, 44L, 53L, 19L, 53L, 57L), 
    Sous_Categorie = c("7", "7", "7", "7", "7", "7", "7", "7", 
    "7", "7", "7", "7", "7", "7", "7", "7", "7", "7", "7", "7", 
    "7", "7", "7", "7", "7", "7", "7", "7", "7", "7", "7", "7", 
    "7", "7", "7", "7,9", "8", "8", "8", "8", "8", "9", "9", 
    "11", "10,7", "10,8,9", "7", "7", "7", "7", "7", "7,8", "8", 
    "7", "7", "7", "7", "7", "7", "7", "7", "7", "7", "7", "8", 
    "11", "7", "12", "12", "12", "12", "12", "12", "12", "12", 
    "12", "12", "12", "12", "12", "12", "12", "12", "13", "13", 
    "13", "13", "13", "14", "14", "14", "14", "14", "14", "14", 
    "14", "14", "14", "14", "14", "14", "14", "14", "14", "14", 
    "14", "14", "14", "15", "15", "15", "15", "15", "17", "17", 
    "17", "17", "17", "17", "17", "17", "17", "17", "17", "17", 
    "17", "17", "17", "17", "17", "17", "17", "17", "17", "17", 
    "17", "17", "17", "17", "17", "17", "17", "17", "17", "17", 
    "17", "17", "17", "17", "17", "17", "17", "17", "17", "17", 
    "17", "17", "17", "17", "17", "17", "17", "17", "17", "17", 
    "17", "17", "17", "17", "17", "17", "17", "17", "17", "17", 
    "17", "17", "17", "17", "17", "17", "17", "17", "17", "17", 
    "17", "17", "17", "17", "17", "18", "18", "18", "7,12", "7,12", 
    "7,12", "7,12", "7,13,17", "7,16", "7,17", "7,17", "7,17", 
    "7,17", "7,17", "8,17", "8,17", "11,17", "7,17", "7,17", 
    "8,17", "7,17", "7,17", "12,14", "12,15", "17,18")), .Names = c("Age", 
"Sous_Categorie"), row.names = c(NA, -215L), class = "data.frame")

The code that I used, written with the help of Stack Overflow members (thanks again):

    # Distinct category codes that occur anywhere in the comma-separated column.
    cats <- unique(
      unlist(strsplit(DF[["Sous_Categorie"]], split = ","))
    )

cat_perc <- function(cats, vec) {
  # Builds a one-row data frame with one column per entry of `cats`.
  # Each value is that entry's share of all matches, rescaled by length(vec).
  #
  # cats: patterns to look for (used as the column names of the result).
  # vec:  character vector that is searched.

  # Number of elements of `vec` in which `code` is found. grepl() treats
  # `code` as a regular expression that may match anywhere in an element.
  count_hits <- function(code) {
    sum(grepl(code, vec))
  }
  hits <- sapply(cats, count_hits)

  scaled <- (hits / sum(hits)) * length(vec)

  out <- as.data.frame(as.list(scaled))
  names(out) <- cats
  out
}

# Compute the per-category values, then rescale them so they sum to 100.
a <- cat_perc(cats, DF$Sous_Categorie)
a <- a / sum(a) * 100

The results that I obtained:

 7           9        8        11        10      12       13       14
46.20061 1.215805 4.863222 0.9118541 0.6079027 6.68693 1.823708 6.382979
    15       17       18        16
1.823708 27.96353 1.215805 0.3039514

These results do not seem coherent to me: according to the original data DF, it is clear that I have more 17s than 7s. Why do I obtain such a result? Is it a coding problem or a statistical issue?

Thanks a lot

Community
  • 1
  • 1
ranell
  • 683
  • 13
  • 29
  • 1
    `grepl` is returning `TRUE` when it sees a 7, even if that 7 is in a 17. Better to use something like `table(unlist(strsplit(vec, ',')))` – alistaire Apr 14 '16 at 17:37
  • Thank you a lot! It was a nightmare. I will use characters instead of numbers. But if someone has an idea how to do it with numbers, I'm interested. I tried the table/unlist approach, but it didn't work correctly for me. – ranell Apr 14 '16 at 18:08

1 Answer

1

grepl is better for working with strings than numbers; in this case the pattern 7 also matches inside 17, which you don't want. You could write a more complicated regex, but it's easier to treat your data as the numbers it is.

Refactored a little bit, chopping out some unnecessary bits:

cat_perc <- function(vec) {
    # Percentage share of each category code among all codes found in `vec`.
    #
    # vec: character vector (or factor) whose elements hold one or more
    #      numeric codes separated by commas, e.g. "7" or "7,17".
    # Returns a 1-d table, ordered by numeric code, whose values sum to 100.

    # strsplit() only accepts character input, so factors are converted first;
    # fixed = TRUE splits on a literal comma instead of a regular expression.
    codes <- unlist(strsplit(as.character(vec), ",", fixed = TRUE))

    # Converting to numeric makes table() count whole codes (7 is distinct
    # from 17) and orders the result numerically rather than alphabetically.
    nums <- table(as.numeric(codes))

    # Rescaling by length(vec) before normalising would cancel out, so the
    # percentages are computed directly from the counts.
    nums / sum(nums) * 100
}

# Percentage share of every category code in the sample data.
cat_perc(DF[["Sous_Categorie"]])
#          7          8          9         10         11         12         13         14 
# 28.8065844  4.9382716  1.6460905  0.8230453  1.2345679  9.0534979  2.4691358  8.6419753 
#        15         16         17         18 
# 2.4691358  0.4115226 37.8600823  1.6460905 

Alternatively, without the function:

# Count every comma-separated category code across all rows.
nums <- table(as.numeric(unlist(strsplit(DF[["Sous_Categorie"]], ","))))
# Shares rescaled by the number of rows; data.frame() turns the table into
# a Var1/Freq data frame.
a <- data.frame(nums / sum(nums) * length(DF[["Sous_Categorie"]]))
# Normalise the Freq column so that it sums to 100.
a[["Freq"]] <- a[["Freq"]] / sum(a[["Freq"]]) * 100
a
#    Var1       Freq
# 1     7 28.8065844
# 2     8  4.9382716
# 3     9  1.6460905
# 4    10  0.8230453
# 5    11  1.2345679
# 6    12  9.0534979
# 7    13  2.4691358
# 8    14  8.6419753
# 9    15  2.4691358
# 10   16  0.4115226
# 11   17 37.8600823
# 12   18  1.6460905

Add or drop the data.frame and subsetting depending on which format you prefer.

alistaire
  • 42,459
  • 4
  • 77
  • 117