1

Essentially my question is very easy. I want to know where inside the lm function, the data is subsetted (where the NA's are removed), based on all the variables used in the formula. The reason that I want to know that, is because I want to sum a variable using only the subsetted data (where the NA's are removed), and not my full dataset.

Background:

I tried to adapt the lm function in R, to account for the correct degrees of freedom when using weights in lm. I thought the easiest solution would be to count the sum of weights, for the subsetted data, contingent on the selected variables, from within the function. So first I see where lm subsets the dataset, based on the selected variable and there I count sum(ind$weight_freq), which I want to provide as one of the outputs for the function so that I can refer to it.

Example data:

library(dplyr)

set.seed(1024)

# individual (true) dataset
x <- round(rnorm(1e5))
y <- round(x + x^2 + rnorm(1e5))
ind <- data.frame(x, y)

# Create an NA value
ind[1,1] <- NA

ind <- ind %>%
  group_by(x, y) %>%
  summarize(weight_freq= n())

I started out simply by copying the lm code into a new function, and replacing all <- with <<-, to see where the data is subsetted, using lm_plus(y ~ x, data = ind, weights = weight_freq), and although I get the error, Error in lm_plus(y ~ x, data = ind, weights = weight_freq) : argument "offset" is missing, with no default, the code goes far enough for the subsetting to happen (because mf should be 99999 due to the on NA, and it is):

lm_plus <- function (formula, data, subset, weights, na.action, method = "qr", 
    model = TRUE, x = FALSE, y = FALSE, qr = TRUE, singular.ok = TRUE, 
    contrasts = NULL, offset, ...) 
{
    ret.x <<- x
    ret.y <<- y
    cl <<- match.call()
    mf <<- match.call(expand.dots = FALSE)
    m <<- match(c("formula", "data", "subset", 
        "weights", "na.action", "offset"), 
        names(mf), 0L)
    mf <<- mf[c(1L, m)]
    mf$drop.unused.levels <<- TRUE
    mf[[1L]] <<- quote(stats::model.frame)
    mf <<- eval(mf, parent.frame())
    if (method == "model.frame") 
        return(mf)
    else if (method != "qr") 
        warning(gettextf("method = '%s' is not supported. Using 'qr'", 
            method), domain = NA)
    mt <<- attr(mf, "terms")
    y <<- model.response(mf, "numeric")
    w <<- as.vector(model.weights(mf))
    if (!is.null(w) && !is.numeric(w)) 
        stop("'weights' must be a numeric vector")
    offset <<- model.offset(mf)
    mlm <<- is.matrix(y)
    ny <<- if (mlm) 
        nrow(y)
    else length(y)
    if (!is.null(offset)) {
        if (!mlm) 
            offset <<- as.vector(offset)
        if (NROW(offset) != ny) 
            stop(gettextf("number of offsets is %d, should equal %d (number of observations)", 
                NROW(offset), ny), domain = NA)
    }
    if (is.empty.model(mt)) {
        x <<- NULL
        z <<- list(coefficients = if (mlm) matrix(NA_real_, 0, 
            ncol(y)) else numeric(), residuals = y, fitted.values = 0 * 
            y, weights = w, rank = 0L, df.residual = if (!is.null(w)) sum(w != 
            0) else ny)
        if (!is.null(offset)) {
            z$fitted.values <<- offset
            z$residuals <<- y - offset
        }
    }
    else {
        x <<- model.matrix(mt, mf, contrasts)
        z <<- if (is.null(w)) 
            lm.fit(x, y, offset = offset, singular.ok = singular.ok, 
                ...)
        else lm.wfit(x, y, w, offset = offset, singular.ok = singular.ok, 
            ...)
    }
    class(z) <<- c(if (mlm) "mlm", "lm")
    z$na.action <<- attr(mf, "na.action")
    z$offset <<- offset
    z$contrasts <<- attr(x, "contrasts")
    z$xlevels <<- .getXlevels(mt, mf)
    z$call <<- cl
    z$terms <<- mt
    if (model) 
        z$model <<- mf
    if (ret.x) 
        z$x <<- x
    if (ret.y) 
        z$y <<- y
    if (!qr) 
        z$qr <<- NULL
    z
}

I then tried renaming each of the mf instances to mf1, mf2, mf3, to see where mf is actually subsetted, but I get stuck because I am getting errors (even though I thought I made sure I had the reference between the mf correctly.

I also tried to put in test <<- sum(mf$weights, na.rm=TRUE) here and there, but without success.

Is there anyone who could help me out with summing the weights in the right place?

Tom
  • 2,173
  • 1
  • 17
  • 44
  • When you say "where is the data subsetted?", do you mean "where are the `NA` values removed?" – Allan Cameron Dec 27 '20 at 10:42
  • @AllanCameron Yes, exactly. Used your phrasing to improve the explanation. – Tom Dec 27 '20 at 10:43
  • Aren't they removed inside `model.frame` rather than inside `lm`? – Allan Cameron Dec 27 '20 at 10:45
  • @AllanCameron, that would explain why I could not solve it haha. Honestly the code is a bit too complicated for me. The whole point is that I do not understand where it happens. – Tom Dec 27 '20 at 10:46
  • I don't know that for sure (can't check at the moment) but it seems likely. The `na.action` is passed to `model.frame` and I'm sure if you look at the source code you'll see where the `NA` values are dropped – Allan Cameron Dec 27 '20 at 10:48
  • Okay, I think I figure it out. I added `sum_of_weights <<- sum(as.vector(model.weights(mf)))` underneath `w <- as.vector(model.weights(mf))`, and then added `z$df.residual <- sum_of_weights - length(coef(z))` before the last line `z`. – Tom Dec 27 '20 at 11:02

1 Answers1

1

This works:

EDIT: As a small note. I think it is better to simply overwrite the function altogether. I do not think there would be any downside to it and I noticed that some other lm dependent functions start complaining when you don't use lm.

lm_plus <- function (formula, data, subset, weights, na.action, method = "qr", 
    model = TRUE, x = FALSE, y = FALSE, qr = TRUE, singular.ok = TRUE, 
    contrasts = NULL, offset, ...) 
{
    ret.x <- x
    ret.y <- y
    cl <- match.call()
    mf <- match.call(expand.dots = FALSE)
    m <- match(c("formula", "data", "subset", 
        "weights", "na.action", "offset"), 
        names(mf), 0L)
    mf <- mf[c(1L, m)]
    mf$drop.unused.levels <- TRUE
    mf[[1L]] <- quote(stats::model.frame)
    mf <- eval(mf, parent.frame())
    # test <<- sum(mf$weights, na.rm=TRUE)
    if (method == "model.frame") 
        return(mf)
    else if (method != "qr") 
        warning(gettextf("method = '%s' is not supported. Using 'qr'", 
            method), domain = NA)
    mt <- attr(mf, "terms")
    y <- model.response(mf, "numeric")
    w <- as.vector(model.weights(mf))
    sum_of_weights <- sum(as.vector(model.weights(mf)))
    if (!is.null(w) && !is.numeric(w)) 
        stop("'weights' must be a numeric vector")
    offset <- model.offset(mf)
    mlm <- is.matrix(y)
    ny <- if (mlm) 
        nrow(y)
    else length(y)
    if (!is.null(offset)) {
        if (!mlm) 
            offset <- as.vector(offset)
        if (NROW(offset) != ny) 
            stop(gettextf("number of offsets is %d, should equal %d (number of observations)", 
                NROW(offset), ny), domain = NA)
    }
    if (is.empty.model(mt)) {
        x <- NULL
        z <- list(coefficients = if (mlm) matrix(NA_real_, 0, 
            ncol(y)) else numeric(), residuals = y, fitted.values = 0 * 
            y, weights = w, rank = 0L, df.residual = if (!is.null(w)) sum(w != 
            0) else ny)
        if (!is.null(offset)) {
            z$fitted.values <- offset
            z$residuals <- y - offset
        }
    }
    else {
        x <- model.matrix(mt, mf, contrasts)
        z <- if (is.null(w)) 
            lm.fit(x, y, offset = offset, singular.ok = singular.ok, 
                ...)
        else lm.wfit(x, y, w, offset = offset, singular.ok = singular.ok, 
            ...)
    }
    class(z) <- c(if (mlm) "mlm", "lm")
    z$na.action <- attr(mf, "na.action")
    z$offset <- offset
    z$contrasts <- attr(x, "contrasts")
    z$xlevels <- .getXlevels(mt, mf)
    z$call <- cl
    z$terms <- mt
    if (model) 
        z$model <- mf
    if (ret.x) 
        z$x <- x
    if (ret.y) 
        z$y <- y
    if (!qr) 
        z$qr <- NULL
    z$df.residual <- sum_of_weights - length(coef(z))
    z
}

Example data:

library(dplyr)
library(modelsummary)

set.seed(1024)

# individual (true) dataset
x <- round(rnorm(1e5))
y <- round(x + x^2 + rnorm(1e5))
ind <- data.frame(x, y)

# aggregated dataset
agg <- ind %>%
  group_by(x, y) %>%
  summarize(freq = n())

# Note that the last entry uses lm_plus
models <- list( 
  "True"                = lm(y ~ x, data = ind),
  "Aggregated"          = lm(y ~ x, data = agg),
  "Aggregated & W"      = lm(y ~ x, data = agg, weights=freq),
  "Aggregated & W & DF" = lm_plus(y ~ x, data = agg, weights=freq)
)

modelsummary(models, fmt=5)

enter image description here

Tom
  • 2,173
  • 1
  • 17
  • 44