0

I have 2 data frame with some matching columns (pollutants). The first data frame contains the observations while the second one contains different thresholds for some pollutants.

Here a small subset of both data frames:

dput(df1)
structure(list(sample = structure(27:76, .Label = c("A_1", "A_2", 
"A_LS", "A_PC", "A_PM", "B_1", "B1_1", "B1_2", "B1-8_PC", "B1-8_PM", 
"B1_LS", "B1_PC", "B1_PM", "B_2", "B2_1", "B2_2", "B2-8_PC", 
"B2-8_PM", "B2_LS", "B2_PC", "B2_PM", "B_LS", "B_PC", "B_PM", 
"C_1", "C_2", "C386", "C387", "C388", "C389", "C390", "C391", 
"C392", "C393", "C394", "C395", "C396", "C397", "C398", "C399", 
"C400", "C401", "C402", "C403", "C404", "C405", "C406", "C407", 
"C408", "C409", "C410", "C411", "C412", "C413", "C414", "C415", 
"C416", "C417", "C418", "C419", "C420", "C421", "C422", "C423", 
"C424", "C425", "C426", "C427", "C428", "C429", "C430", "C431", 
"C432", "C433", "C434", "C435", "C436", "C437", "C438", "C439", 
"C440", "C441", "C442", "C443", "C444", "C445", "C446", "C447", 
"C448", "C449", "C450", "C451", "C452", "C453", "C454", "C455", 
"C456", "C457", "C458", "C459", "C460", "C461", "C462", "C463", 
"C464", "C465", "C466", "C467", "C468", "C469", "C470", "C471", 
"C472", "C473", "C474", "C475", "C476", "C477", "C478", "C479", 
"C480", "C481", "C482", "C483", "C484", "C485", "C486", "C487", 
"C488", "C489", "C490", "C491", "C492", "C493", "C494", "C495", 
"C496", "C497", "C498", "C499", "C500", "C501", "C502", "C503", 
"C504", "C505", "C506", "C507", "C508", "C509", "C510", "C511", 
"C512", "C513", "C514", "C515", "C516", "C517", "C518", "C519", 
"C520", "C521", "C522", "C523", "C524", "C-8_PC", "C-8_PM", "D_1", 
"D_2", "E_1", "E_2", "F_1", "F_2"), class = "factor"), As = c(9, 
8.75, 13.5, 7.75, 7.6, 8.33, 8, 8.75, 7.4, 8.25, 8.17, 7.75, 
7.6, 7.5, 7.2, 8, 7.83, 7.75, 7, 7.5, 8.17, 8.75, 6.67, 7, 5.83, 
6.75, 5.6, 6.4, 6.2, 6.2, 6.2, 6.25, 7, 6, 6, 6.4, 6, 5.8, 5.6, 
6, 5.8, 7.25, 8.8, 8.5, 8, 8.25, 8.25, 8.5, 8.25, 8.25), Al = c(30245, 
38060, 36280, 24355, 27776, 35190, 38733.8, 36400, 29624, 33699.75, 
32163.33, 30645.75, 31373, 26647.5, 19987.6, 32210, 27158, 24220.25, 
18598.5, 23081.75, 29393, 26800.5, 22581.67, 29290, 29651.67, 
20947.5, 19762.6, 23815, 32784.8, 20696.2, 26880.6, 25087.75, 
19497.2, 21794, 32232, 24253.4, 20034, 21270, 22510, 15170.25, 
8956.6, 21612.25, 35828, 30006.25, 27128.75, 25835, 31118.75, 
35614.5, 37440.25, 33736.75), Hg = c(0.25, 0.35, 0.48, 1.03, 
1.12, 0.2, 1.14, 0.4, 2, 0.48, 0.85, 0.18, 0.76, 0.4, 0.48, 0.35, 
0.32, 0.33, 0.4, 0.13, 0.15, 0.13, 0.87, 0.12, 0.03, 0.33, 0.2, 
0.22, 0.04, 0.16, 0.1, 0.18, 0.11, 0.08, 0.03, 0.06, 0.06, 0.1, 
0.03, 0.07, 0.03, 0.1, 0.08, 0.11, 0.1, 0.13, 0.08, 0.12, 0.07, 
0.09)), .Names = c("sample", "As", "Al", "Hg"), row.names = c(NA, 
50L), class = "data.frame")

and

dput(df2)
structure(list(As = c(25L, 32L), Hg = c(0.4, 0.8), Cr = c(100L, 
360L), Element = structure(c(1L, 3L), .Label = c("LCB", "LCB_pelite", 
"LCL"), class = "factor")), .Names = c("As", "Hg", "Cr", "Element"
), row.names = c(NA, -2L), class = "data.frame")

Actually the original data frames are bigger, but this subset gives the idea.

What I want now is to put in a 3rd data frames the values of each element of the first df that exceed the threshold values contained in the second df.

Be aware that there are 2 different threshold values (for each element) in df2 and df2 has some element not matched in df1 (for example Cr).

I've tried to write a for loop but I was able to do that just for 1 element at a time:

for (i in df2$As)  {
    print(length(which(df1$As > i)))
}

I've also tried to use nested for loops but without success..

matteo
  • 4,683
  • 9
  • 41
  • 77
  • What is your expected output? In your third df, do you want separate informations for each threshold, or cases that exceed the lower or higher threshold? – Molx Mar 23 '15 at 18:35
  • I'd like to know the number of cases that the element exceed the threshold values.. – matteo Mar 23 '15 at 18:54

3 Answers3

1

I'm pretty sure this does not look good, but I think it works. I added some extra lines to match only the elements found in both data frames, which in this case is only 1. It might ned some changes for your full data:

df1.2 <- rbind(df1, df1) #Duplicate the df1 to compare to each threshold value
df1.2 <- df1.2[order(df1.2$sample),] #Order by sample again
cols2 <- na.omit(match(colnames(df1), colnames(df2)))[[1]] #Get the columns of df2 which are in df1
cols1 <- na.omit(match(colnames(df2), colnames(df1)))[[1]] #Get the columns of df1 which are in df2
df2.2 <- df2[rep(1:2, nrow(df1)),cols2] #Replicates df2 the number of times to allow matching the thresholds to each sample, once for each threshold
exceeds <- df1.2[,cols1]>df2.2 #Make the comparions and return a boolean
sum(exceeds) #You will need colSums() for more than one column

With your sample data it's also not clear from the answer which elements ir refers to, but this shouldn't happen if more than one element matches and your result is a matrix.

Maybe there's a more elegant way without replicating the dataframes and having to worry about number of element matches.

Molx
  • 6,816
  • 2
  • 31
  • 47
  • Thanks for the effort.. your solution is a little bit complicated, but it is still a solution.. – matteo Mar 24 '15 at 07:49
  • @matteo You're welcome. Despite looking more complicated than the other proposed solution (does it?), I think it would be more efficient for large data, and is also more idiomatic since no for loops are involved. Yet, you should of course use the code that fits you best. – Molx Mar 24 '15 at 12:43
  • 1
    @Molx, my solution does loop through the individual pollutants, which I am guessing is not a too very long list. The calculation through the (probably) much longer list of samples is done vector-wise with`sum`. – DaveTurek Mar 24 '15 at 14:48
1
df3=data.frame(Pollutant="Z",LCB=0,LCL=0,stringsAsFactors=FALSE)
for (p in names(df1)[-1]) {
  if(p %in% names(df2)[1:(length(df2)-1)]) {
    df3 = rbind(df3,c(p,sum(df1[p]>df2[[p]][1]),sum(df1[p]>df2[[p]][2])))
  }
}
df3=df3[-1,]
df3

Update:

Ah, each new row is rbound as a character vector. To finish up:

str(df3)
df3$LCB=as.numeric(df3$LCB)
df3$LCL=as.numeric(df3$LCL)
str(df3)
DaveTurek
  • 1,297
  • 7
  • 8
0

How about this?

foo <- function(x, y) {
    sapply(x, function(i) sum(y>i))
}

cols = c("As", "Hg")
mapply(foo, df2[cols], df1[cols])
#      As Hg
# [1,]  0 10
# [2,]  0  6

Convert this to a data.frame if necessary.

Arun
  • 116,683
  • 26
  • 284
  • 387
  • Since there could be many more pollutants - some not common to both data frames - how about `cols = names(df1)[names(df1) %in% names(df2)]` – DaveTurek Mar 26 '15 at 16:20