0

Let's consider this data set:

df <- data.frame(age=   sample(c(20:90), 20, rep=T), 
             sex =  sample(c('m', 'f'), 20, rep=T),
             smoker=sample(c("never", "former", "active"), 20, rep=T),
             size=  sample (c(8:40), 20, rep=T),
             fac =  as.factor(sample(c("neg","lo","med","hi"), 20, rep=T)),
             outcome = sample(c(0,1), 20, rep=T)
             )
# let's introduce some missing data         
for (i in (1:3)) {df[sample(c(1:20),1),  sample(c(1:6),1)]  <- NA}

In a medical manuscript the first table summarizes the population (or its subgroups as appropriate); here the rows would be age, sex, smoking status, etc and the two outcomes would be listed in separate columns. The continuous variables are reported as means; the categorical variables as counts.

  1. I was wondering if there is a function that I am missing that creates such contingency tables. I can do that manually but would love to be able to automatically update if the data set changes. Ultimately I need to output in latex.
  2. the function would need to ignore missing data, but not delete those rows.

Asking too much?!

  • 1
    Default `table` 'ignores' `NA`. Compare `with(df, table(fac, outcome))` and `with(df, table(fac, outcome, useNA = "always"))` or `with(df, table(fac, outcome, exclude = NULL))`. – Henrik Aug 13 '13 at 06:29
  • 1
    Regarding `can't find object 'fac'`, you need to refer to your dataset `df`. – Henrik Aug 13 '13 at 06:32
  • 1
    Regarding `table(na.omit(mean(age)), outcome)`: when you have considered the previous comment and checked `?mean`, `na.rm`, you may wonder again why it does not work, and what you really wish to achieve with this table. – Henrik Aug 13 '13 at 07:24
  • I'm having trouble understanding. Since `age` is continuous, do you want a single row with the mean `age` as the rowname and the two columns for the counts the two outcomes? – Gregor Thomas Aug 13 '13 at 20:07
  • the continuous variables would be represented as mean for the subsets in each column ie. mean(age) in outcome==0 and mean(age) in outcome==1 – K Owen - Reinstate Monica Aug 13 '13 at 20:53

1 Answers1

0

In medical articles, 'Table 1' summarizes the demographics of the study population, usually broken down between subgroups

Generate data set

n <- 100
df <- data.frame(
age = sample(c(20:90), n, rep = T), 
sex = sample(c("m", "f"), 20, rep = T, prob = c(0.55, 0.45)), 
smoker = sample(c("never", "former", "active"), n, rep = T, prob = c(0.4, 0.45, 0.15)), 
size = abs(rnorm(n, 20, 8)), 
logitest = sample(c(TRUE, FALSE), n, rep = T, prob = c(0.1, 0.9)), 
labtest = as.factor(sample(c("neg", "lo", quot;med",quot;hi"), n, rep = T, prob = c(0.4, 0.3, 0.2, 0.1))), 
outcome = sample(c(0, 1), n, rep = T, prob = c(0.8, 0.2))
)

# let's introduce some missing data
for (i in (1:floor(n/6))) {
    df[sample(c(1:n), 1), sample(c(1:ncol(df)), 1)] <- NA
}
head(df)
##   age sex smoker  size logitest labtest outcome
## 1  70   m former 39.17       NA     med      NA
## 2  51   f former 33.64    FALSE      hi       1
## 3  58   f former 10.10    FALSE     neg       1
## 4  30   m former 43.24    FALSE     med       0
## 5  54   m former 22.78    FALSE      lo       0
## 6  86   f former  8.20    FALSE     neg       0
if working a real data set, use it instead
# df <- read.csv() 
#you may need to eliminate some columns    
#colnames(df) 
#df0<-df #backup 
#df <- df[,-c(1,...,27:38)]

Change this as needed: the column with the diagnosis has to be removed from the variables list!

dx <- 7  #index of outcome/diagnosis
####################################
summary(df[, -dx])
##       age         sex        smoker        size        logitest      
##  Min.   :20.0   f   :44   active:19   Min.   : 0.91   Mode :logical  
##  1st Qu.:42.5   m   :54   former:49   1st Qu.:15.00   FALSE:85       
##  Median :58.0   NA's: 2   never :30   Median :20.12   TRUE :12       
##  Mean   :57.3             NA's  : 2   Mean   :20.44   NA's :3        
##  3rd Qu.:74.0                         3rd Qu.:27.10                  
##  Max.   :88.0                         Max.   :43.24                  
##  NA's   :1                            NA's   :2                      
##  labtest  
##  hi  : 4  
##  lo  :29  
##  med :20  
##  neg :45  
##  NA's: 2  
##           
## 
attach(df)

Build list of vars

vars <- colnames(df)
vars
## [1] "age"      "sex"      "smoker"   "size"     "logitest" "labtest" 
## [7] "outcome"
catvars <- NULL  #categorical variables
contvars <- NULL  #continuous variables
logivars <- NULL  #logic variables

vars <- vars[-dx]
vars
## [1] "age"      "sex"      "smoker"   "size"     "logitest" "labtest"
for (i in 1:length(vars)) {
    ifelse(is.factor(df[, i]), catvars <- c(catvars, vars[i]), ifelse(is.logical(df[, 
        i]), logivars <- c(logivars, vars[i]), contvars <- c(contvars, vars[i])))
}
contvars
## [1] "age"  "size"
catvars
## [1] "sex"     "smoker"  "labtest"
logivars
## [1] "logitest"

Create the subgroups

bg <- df[df[, dx] == 0 & !is.na(df[, dx]), ]
nrow(bg)  #; bg
## [1] 73
mg <- df[df[, dx] == 1 & !is.na(df[, dx]), ]
nrow(mg)  #; mg
## [1] 23
indet <- df[is.na(df[, dx]), ]
nrow(indet)
## [1] 4
indet
##    age sex smoker   size logitest labtest outcome
## 1   70   m former 39.173       NA     med      NA
## 9   87   m former 23.621    FALSE      lo      NA
## 18  65   m former  2.466    FALSE    <NA>      NA
## 67  88   f former 17.575    FALSE     med      NA

For continuous variables

normality testing
normality <- NULL
for (i in 1:length(contvars)) {
    j <- which(vars == contvars[i])  #find position of variable in the original data frame and its subsets
    st <- shapiro.test(df[, j])  #normality testing on all patients, bg and mg alike
    normality <- c(normality, st$p.value)  #normality testing on all patients, bg and mg alike
}
normality
## [1] 0.00125 0.73602
comparing the means of two samples; if normal, use t-test, otherwise wilcoxon
ttpvalue <- NULL
for (i in 1:length(contvars)) {
    j <- which(vars == contvars[i])  #find position of variable in the original data frame and its subsets
    ## if normal, use t-test, otherwise wilcoxon if shapiro p<.05 then pop
    ## likely NOT normally dist
    ifelse(normality[i] < 0.05, tt <- wilcox.test(bg[, j], mg[, j]), tt <- t.test(bg[, 
        j], mg[, j]))
    ttpvalue <- c(ttpvalue, tt$p.value)  ##if t-test p<.05 then pop likely have different means
}
ttpvalue
## [1] 0.6358 0.3673
contvarlist <- list(variables = contvars, normality = normality, ttest.by.subgroup = ttpvalue)

For categorical variables

chisqpvalue <- NULL
for (i in 1:length(catvars)) {
    j <- which(vars == catvars[i])  #find position of variable in the original data frame and its subsets
    tbl <- table(df[, j], df[, dx])
    chisqtest <- summary(tbl)
    chisqpvalue <- c(chisqpvalue, chisqtest$p.value)
}
chisqpvalue
## [1] 0.01579 0.77116 0.39484
catvarlist <- list(variables = catvars, chisq.by.subgroup = chisqpvalue)

For logic variables

proppvalue <- NULL
for (i in 1:length(logivars)) {
    j <- which(vars == logivars[i])  #find position of variable in the original data frame and its subsets
    tbl <- table(df[, j], df[, dx])
    chisqtest <- summary(tbl)
    proppvalue <- c(proppvalue, chisqtest$p.value)
}
proppvalue
## [1] 0.5551
logivarlist = list(variables = logivars, chisq.by.subgroup = proppvalue)

And now, the results!

str(contvarlist)  #if shapiro p<.05 then pop likely NOT normally dist; if t-test p<.05 then pop likely have different means
## List of 3
##  $ variables        : chr [1:2] "age" "size"
##  $ normality        : num [1:2] 0.00125 0.73602
##  $ ttest.by.subgroup: num [1:2] 0.636 0.367
str(catvarlist)  #if chisq p<.05 then variables are likely NOT independent
## List of 2
##  $ variables        : chr [1:3] "sex" "smoker" "labtest"
##  $ chisq.by.subgroup: num [1:3] 0.0158 0.7712 0.3948
str(logivarlist)  #if chisq p<.05 then variables are likely NOT independent
## List of 2
##  $ variables        : chr "logitest"
##  $ chisq.by.subgroup: num 0.555