1

I am having trouble to do a merge based on the nearest numeric value between two sets of data in R.

I have a numeric vector that contains some numeric id values (id_1) and a data.frame with related (integer) id values and measured variables that I would like to merge (df_to_merge).

A minimum example representing my data is

set.seed(123)
    id_1 <- sort(runif(1499, min=0, max=750))
    
    df_to_merge <- data.frame(
      id_2 = c(2,20,45,60,72,100,150,200),
      var1 = runif(8, min = 0, max = 30),
      var2 = runif(8, min = 0, max = 10),
      var3 = runif(8, min = 0, max = 3)
      )

> df_to_merge
  id_2      var1     var2      var3
1    2 28.705000 3.279207 1.6321981
2   20 13.600025 9.545036 1.7824261
3   45 20.327119 8.895393 0.8674792
4   60 17.179002 6.928034 0.4413409
5   72  3.087740 6.405068 2.8890727
6  100 26.994749 9.942698 2.7068971
7  150  7.382632 6.557058 2.0721158
8  200  1.261786 7.085305 2.3864023

Obviously, a simple merge as follows does not work as the id values id_1 and id_2 are not matching (or only partially matching):

df <- merge(cbind(id_1),
            df_to_merge,
            by = 1, all.x = TRUE)

What I want to achieve is a data.frame like df_wanted below, where data are merged on the minimum difference between the id values id_1 and id_2:

df_intermediate <- data.frame(
  id_1 = c(id_1[c(5,44,89,123,145,200,298,397)]),
  var1 = df_to_merge$var1,
  var2 = df_to_merge$var2,
  var3 = df_to_merge$var3
  )

df_wanted <- merge(cbind(id_1),
                   df_intermediate,
                   by = 1, 
                   all.x = TRUE
                   )

> head(df_wanted)
       id_1   var1     var2     var3
1 0.3490118     NA       NA       NA
2 0.4685800     NA       NA       NA
3 0.8668648     NA       NA       NA
4 0.8937213     NA       NA       NA
5 1.8591077 28.705 3.279207 1.632198
6 2.7631707     NA       NA       NA

I went through many similar questions that somewhat got close to my problem (e.g. https://stackoverflow.com/a/16096226; https://discuss.analyticsvidhya.com/t/how-to-merge-datasets-on-nearest-values-in-r/570/4, yet, I didn't manage to achieve the result that I'm after.

Ideally, I would prefer a solution based on base R.

Many thanks in advance for any help and suggestions!

1 Answers1

0

Here is a data.table approach..

library( data.table )
#set as data.table
setDT(df_to_merge)
ans <- data.table( id_1 = id_1)
#introduce id's in new.dt, to prevent nasty floating point error later on
ans[, id := .I ]

#first, get nearest id1 to id_2
df_to_merge[, c("id_1", "id") := ans[ df_to_merge, .(x.id_1, x.id), on = .(id_1 = id_2), roll = "nearest" ] ]

#    id_2      var1     var2       var3       id_1  id
# 1:    2 13.126522 4.336312 1.25994881   1.859108   5
# 2:   20 28.140848 4.261751 0.06237255  19.710629  44
# 3:   45 29.640099 5.968544 1.14134391  45.540429  89
# 4:   60 13.689587 4.520856 1.26704528  59.979684 123
# 5:   72  6.918448 9.566087 0.57427819  72.225833 145
# 6:  100 20.864678 8.444304 2.11598785 101.247299 200
# 7:  150 16.698970 2.227686 0.04056143 149.933831 297
# 8:  200 17.541303 4.201702 2.19360932 199.479480 397

#now, join back... take care of floating point errors! by joining on id, not on id_1
#  (not sure if this is necessairy, but better safe than sorry)
#   drop the temporary id-col
ans[ df_to_merge, `:=`(var1 = i.var1, var2 = i.var2, var3 = i.var3 ), on = .(id) ][, id := NULL ]

#              id_1     var1     var2     var3
#    1:   0.3490118       NA       NA       NA
#    2:   0.4685800       NA       NA       NA
#    3:   0.8668648       NA       NA       NA
#    4:   0.8937213       NA       NA       NA
#    5:   1.8591077 13.12652 4.336312 1.259949
# ---                                       
# 1495: 747.3342805       NA       NA       NA
# 1496: 747.4629140       NA       NA       NA
# 1497: 749.1619868       NA       NA       NA
# 1498: 749.5533960       NA       NA       NA
# 1499: 749.6429831       NA       NA       NA
Wimpel
  • 26,031
  • 1
  • 20
  • 37