1

I have the dataset below:

##      ID        DOB sector meters Oct   Res_FROM     Res_TO   Exp_FROM
## 1 20100 1979-08-24    H38   6400   W 1979-08-15 1991-05-15 1979-08-24
## 2 20101 1980-05-05    B01   1600  NW 1980-05-15 1991-04-15 1980-05-15
## 3 20102 1979-03-17    H04   1600  SW 1972-06-15 1979-08-15 1979-03-17
## 4 20103 1981-11-30    B09   3200  NE 1982-01-15 1984-01-15 1982-01-15
## 5 20103 1981-11-30    B37   8000   N 1984-01-15 1986-04-15 1984-01-15
## 6 20104 1978-09-01    B09   3200  NE 1982-01-15 1984-01-15 1982-01-15
##       Exp_TO Exps_Grp Yr1952 Yr1953 Yr1954 Yr1955 Yr1956 Yr1957 Yr1958 Yr1959
## 1 1988-12-31  fr51>88     NA     NA     NA     NA     NA     NA     NA     NA
## 2 1988-12-31  fr51>88     NA     NA     NA     NA     NA     NA     NA     NA
## 3 1979-08-15  between     NA     NA     NA     NA     NA     NA     NA     NA
## 4 1984-01-15  between     NA     NA     NA     NA     NA     NA     NA     NA
## 5 1986-04-15  between     NA     NA     NA     NA     NA     NA     NA     NA
## 6 1984-01-15  between     NA     NA     NA     NA     NA     NA     NA     NA
##   Yr1960 Yr1961 Yr1962 Yr1963 Yr1964 Yr1965 Yr1966 Yr1967 Yr1968 Yr1969 Yr1970
## 1     NA     NA     NA     NA     NA     NA     NA     NA     NA     NA     NA
## 2     NA     NA     NA     NA     NA     NA     NA     NA     NA     NA     NA
## 3     NA     NA     NA     NA     NA     NA     NA     NA     NA     NA     NA
## 4     NA     NA     NA     NA     NA     NA     NA     NA     NA     NA     NA
## 5     NA     NA     NA     NA     NA     NA     NA     NA     NA     NA     NA
## 6     NA     NA     NA     NA     NA     NA     NA     NA     NA     NA     NA
##   Yr1971 Yr1972 Yr1973 Yr1974 Yr1975 Yr1976 Yr1977 Yr1978    Yr1979   Yr1980
## 1     NA     NA     NA     NA     NA     NA     NA     NA  5.950991 4.340588
## 2     NA     NA     NA     NA     NA     NA     NA     NA        NA 2.927725
## 3     NA     NA     NA     NA     NA     NA     NA     NA 20.608986       NA
## 4     NA     NA     NA     NA     NA     NA     NA     NA        NA       NA
## 5     NA     NA     NA     NA     NA     NA     NA     NA        NA       NA
## 6     NA     NA     NA     NA     NA     NA     NA     NA        NA       NA
##     Yr1981    Yr1982    Yr1983    Yr1984   Yr1985    Yr1986   Yr1987   Yr1988
## 1 4.340588  4.340588  4.340588 4.3405881 4.340588 4.3405881 4.340588 1.083782
## 2 4.447229  4.447229  4.447229 4.4472289 4.447229 4.4472289 4.447229 1.110409
## 3       NA        NA        NA        NA       NA        NA       NA       NA
## 4       NA 15.365412 16.018407 0.6529943       NA        NA       NA       NA
## 5       NA        NA        NA 2.9414202 3.052618 0.6918076       NA       NA
## 6       NA 15.365412 16.018407 0.6529943       NA        NA       NA       NA
##     Yrs_Exp arth_mean    median cumulative  caldate Age Month_Res
## 1 9.3616438  4.175948  4.340588  41.759478 12/31/88   9       141
## 2 8.6356164  3.907637  4.447229  35.168736 12/31/88   9       131
## 3 0.4136986 20.608986 20.608986  20.608986 12/31/88   9        86
## 4 2.0000000 10.678938 15.365412  32.036813 12/31/88   9        24
## 5 2.2493151  2.228615  2.941420   6.685846 12/31/88   8        27
## 6 2.0000000 10.678938 15.365412  32.036813 12/31/88   9        24

I am needing to normalize the cumulative exposures for each individual (ID) by the population of my dataset and by the residence time of each person. How would I do this in R? I have provided a small data set, but in reality I have about 14,000 rows. I've included code for the data as well.

dat <- structure(list(ID = c(20100L, 20101L, 20102L, 20103L, 20103L,
20104L, 20104L, 20105L, 20105L, 20106L, 20106L), DOB = c("1979-08-24",
"1980-05-05", "1979-03-17", "1981-11-30", "1981-11-30", "1978-09-01",
"1978-09-01", "1980-12-03", "1980-12-03", "1978-04-25", "1978-04-25"
), sector = c("H38", "B01", "H04", "B09", "B37", "B09", "B37",
"B09", "B09", "B09", "B09"), meters = c(6400L, 1600L, 1600L,
3200L, 8000L, 3200L, 8000L, 3200L, 3200L, 3200L, 3200L), Oct = c("W",
"NW", "SW", "NE", "N", "NE", "N", "NE", "NE", "NE", "NE"), Res_FROM = c("1979-08-15",
"1980-05-15", "1972-06-15", "1982-01-15", "1984-01-15", "1982-01-15",
"1984-01-15", "1980-12-15", "1983-08-15", "1978-04-15", "1983-08-15"
), Res_TO = c("1991-05-15", "1991-04-15", "1979-08-15", "1984-01-15",
"1986-04-15", "1984-01-15", "1986-04-15", "1983-08-15", "1991-03-15",
"1983-08-15", "2000-01-15"), Exp_FROM = c("1979-08-24", "1980-05-15",
"1979-03-17", "1982-01-15", "1984-01-15", "1982-01-15", "1984-01-15",
"1980-12-15", "1983-08-15", "1978-04-25", "1983-08-15"), Exp_TO = c("1988-12-31",
"1988-12-31", "1979-08-15", "1984-01-15", "1986-04-15", "1984-01-15",
"1986-04-15", "1983-08-15", "1988-12-31", "1983-08-15", "1988-12-31"
), Exps_Grp = c("fr51>88", "fr51>88", "between", "between", "between",
"between", "between", "between", "fr51>88", "between", "fr51>88"
), Yr1952 = c(NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA), Yr1953 = c(NA,
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA), Yr1954 = c(NA, NA, NA,
NA, NA, NA, NA, NA, NA, NA, NA), Yr1955 = c(NA, NA, NA, NA, NA,
NA, NA, NA, NA, NA, NA), Yr1956 = c(NA, NA, NA, NA, NA, NA, NA,
NA, NA, NA, NA), Yr1957 = c(NA, NA, NA, NA, NA, NA, NA, NA, NA,
NA, NA), Yr1958 = c(NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA
), Yr1959 = c(NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA), Yr1960 = c(NA,
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA), Yr1961 = c(NA, NA, NA,
NA, NA, NA, NA, NA, NA, NA, NA), Yr1962 = c(NA, NA, NA, NA, NA,
NA, NA, NA, NA, NA, NA), Yr1963 = c(NA, NA, NA, NA, NA, NA, NA,
NA, NA, NA, NA), Yr1964 = c(NA, NA, NA, NA, NA, NA, NA, NA, NA,
NA, NA), Yr1965 = c(NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA
), Yr1966 = c(NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA), Yr1967 = c(NA,
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA), Yr1968 = c(NA, NA, NA,
NA, NA, NA, NA, NA, NA, NA, NA), Yr1969 = c(NA, NA, NA, NA, NA,
NA, NA, NA, NA, NA, NA), Yr1970 = c(NA, NA, NA, NA, NA, NA, NA,
NA, NA, NA, NA), Yr1971 = c(NA, NA, NA, NA, NA, NA, NA, NA, NA,
NA, NA), Yr1972 = c(NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA
), Yr1973 = c(NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA), Yr1974 = c(NA,
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA), Yr1975 = c(NA, NA, NA,
NA, NA, NA, NA, NA, NA, NA, NA), Yr1976 = c(NA, NA, NA, NA, NA,
NA, NA, NA, NA, NA, NA), Yr1977 = c(NA, NA, NA, NA, NA, NA, NA,
NA, NA, NA, NA), Yr1978 = c(NA, NA, NA, NA, NA, NA, NA, NA, NA,
79.39642441, NA), Yr1979 = c(5.950991161, NA, 20.60898553, NA,
NA, NA, NA, NA, NA, 59.94484924, NA), Yr1980 = c(4.340588078,
2.927724588, NA, NA, NA, NA, NA, 0.758267013, NA, 16.01840668,
NA), Yr1981 = c(4.340588078, 4.447228937, NA, NA, NA, NA, NA,
16.01840668, NA, 16.01840668, NA), Yr1982 = c(4.340588078, 4.447228937,
NA, 15.36541238, NA, 15.36541238, NA, 16.01840668, NA, 16.01840668,
NA), Yr1983 = c(4.340588078, 4.447228937, NA, 16.01840668, NA,
16.01840668, NA, 9.952203009, 6.066203667, 9.952203009, 6.066203667
), Yr1984 = c(4.340588078, 4.447228937, NA, 0.652994292, 2.941420153,
0.652994292, 2.941420153, NA, 16.01840668, NA, 16.01840668),
    Yr1985 = c(4.340588078, 4.447228937, NA, NA, 3.052618478,
    NA, 3.052618478, NA, 16.01840668, NA, 16.01840668), Yr1986 = c(4.340588078,
    4.447228937, NA, NA, 0.691807598, NA, 0.691807598, NA, 16.01840668,
    NA, 16.01840668), Yr1987 = c(4.340588078, 4.447228937, NA,
    NA, NA, NA, NA, NA, 16.01840668, NA, 16.01840668), Yr1988 = c(1.083782142,
    1.110408824, NA, NA, NA, NA, NA, NA, 3.999564755, NA, 3.999564755
    ), Yrs_Exp = c(9.361643836, 8.635616438, 0.41369863, 2, 2.249315068,
    2, 2.249315068, 2.665753425, 5.383561644, 5.309589041, 5.383561644
    ), arth_mean = c(4.175947792, 3.907637331, 20.60898553, 10.67893778,
    2.22861541, 10.67893778, 2.22861541, 10.68682084, 12.35656585,
    32.89144945, 12.35656585), median = c(4.340588078, 4.447228937,
    20.60898553, 15.36541238, 2.941420153, 15.36541238, 2.941420153,
    12.98530484, 16.01840668, 16.01840668, 16.01840668), cumulative = c(41.75947792,
    35.16873597, 20.60898553, 32.03681335, 6.685846229, 32.03681335,
    6.685846229, 42.74728337, 74.13939513, 197.3486967, 74.13939513
    ), caldate = c("12/31/88", "12/31/88", "12/31/88", "12/31/88",
    "12/31/88", "12/31/88", "12/31/88", "12/31/88", "12/31/88",
    "12/31/88", "12/31/88"), Age = c(9L, 9L, 9L, 9L, 8L, 9L,
    7L, 7L, 10L, 10L, 8L), Month_Res = c(141L, 131L, 86L, 24L,
    27L, 24L, 27L, 32L, 91L, 64L, 197L)), class = "data.frame", row.names = c(NA,
-11L))
barnsm2
  • 185
  • 7
  • 4
    Please include the desired output in your question. – Vishal A. Feb 23 '22 at 05:47
  • 3
    could you add a little bit to your definition of normalize? Is it for linear modeling? Have you tried log or root transformations? What about the function ```scale()``` – Andy Feb 23 '22 at 06:55
  • I've seen the use of scale, but I'll look more into it to see if it fits my needs. I'm just needing to normalize mainly by population. I've read that you can divide something, such as cumulative exposure per individual and divide by the total population. Then I could multiple by 100 to make the result out of 100 people. But is this normalization? – barnsm2 Feb 23 '22 at 14:55

0 Answers0