I am essentially trying to join two sets of data. However, the data has inexact matches (e.g., a date in the main data that is between a start and stop date in the lookup table). I know that fuzzy_join works, but I cannot use it due to memory space, which has forced me to use for loops. The data set continues to grow (> 3,000,000 rows) and for loops are now taking multiple days to run, so I am trying to find a faster way to index the rows in the lookup table.
The following is a very simplified version of what I am currently doing. Sorry that the data is a bit non-sensical (it was what I could quickly come up with for a fake data set).
I would be most happy with a way to use the multi-contitional which statment without useing a loop--even if it just outputs the row number(s) of the lookup table that match the row of interest in the main data set.
df<-cars# main data
#Creating matching data for a lookup table
lookup<-cars[-which(duplicated(cars)),]
lookup$location<-state.abb[-1]
#linking the main data to the lookup table analogous to a left_join
for (i in 1:nrow(df)) {
row <- which(lookup[,"speed"] == df[i,"speed"] &
lookup[,"dist"]==df[i,"dist"])
if (length(row)==1){df[i,"location"]<-lookup[row,"location"]}
else if (length(row)!=1){df[i,"location"]<-"Error"}
}
The following is an example where the data more closely matches the question as per another users request.
Lookup<-structure(list(Deploy = structure(c(18575, 18575, 18575, 18575,
18575, 18464, 18573, 18573, 18573), class = "Date"), Retrieved = structure(c(18978,
18978, 18978, 18978, 18978, 18978, 18978, 18978, 18978), class = "Date"),
Station = c("484451", "484449", "484450", "484452", "484453",
"483231", "484457", "484456", "484455"), Name = c("A", "B",
"C", "D", "E", "F", "G", "H", "I")), row.names = c(7L, 8L,
9L, 10L, 11L, 16L, 27L, 33L, 34L), class = "data.frame")
df<-structure(list(DateTime = c("2020-11-07 14:48:17 CST", "2021-04-24 12:31:50 CDT",
"2021-04-28 00:03:56 CDT", "2021-04-30 21:53:25 CDT", "2021-05-08 08:22:37 CDT",
"2021-05-14 13:01:15 CDT", "2021-05-20 02:29:43 CDT", "2021-05-24 13:54:15 CDT",
"2021-05-30 04:11:33 CDT", "2021-06-03 03:38:52 CDT", "2021-06-06 01:40:47 CDT",
"2021-06-09 04:05:07 CDT", "2021-06-13 22:55:06 CDT", "2021-06-18 15:19:01 CDT",
"2021-06-30 01:12:33 CDT", "2021-08-13 19:04:24 CDT", "2021-09-07 19:41:25 CDT",
"2021-11-19 19:36:32 CST", "2021-12-02 19:37:50 CST", "2021-03-31 07:50:39 CDT",
"2021-04-16 02:25:56 CDT", "2021-04-26 23:12:17 CDT", "2021-05-03 15:00:23 CDT",
"2021-05-12 05:50:45 CDT", "2021-05-18 15:00:19 CDT", "2021-05-28 22:52:07 CDT",
"2021-06-05 15:29:18 CDT", "2021-06-12 11:18:47 CDT", "2021-06-19 04:10:22 CDT",
"2021-06-30 10:11:13 CDT", "2021-08-14 08:21:32 CDT", "2021-06-28 15:54:43 CDT",
"2021-07-04 12:44:51 CDT", "2021-02-14 22:28:41 CST", "2021-03-14 15:14:28 CDT",
"2021-04-16 05:10:42 CDT", "2021-04-20 00:52:10 CDT", "2021-04-23 06:11:06 CDT",
"2021-04-26 16:45:59 CDT", "2021-04-30 03:43:53 CDT", "2021-05-03 11:11:58 CDT",
"2021-05-07 15:27:46 CDT", "2021-05-11 05:51:15 CDT", "2021-05-14 06:06:39 CDT",
"2021-05-17 07:08:14 CDT", "2021-05-20 08:26:21 CDT", "2021-05-23 16:48:55 CDT",
"2021-05-27 23:53:04 CDT", "2021-05-31 17:05:24 CDT", "2021-06-03 21:19:24 CDT",
"2021-06-07 00:20:27 CDT", "2021-06-10 16:06:36 CDT", "2021-06-13 20:26:05 CDT",
"2021-06-17 04:28:53 CDT", "2021-06-20 15:26:44 CDT", "2021-06-24 05:24:39 CDT",
"2021-06-28 13:57:09 CDT", "2021-07-02 12:13:23 CDT", "2021-07-05 23:31:31 CDT",
"2021-07-09 06:34:07 CDT", "2021-07-13 17:30:26 CDT", "2021-07-17 19:38:44 CDT",
"2021-07-21 17:24:29 CDT", "2021-07-26 19:54:42 CDT", "2021-07-31 15:53:39 CDT",
"2021-08-05 05:57:23 CDT", "2021-08-09 14:34:04 CDT", "2021-08-13 20:34:30 CDT",
"2021-08-17 21:23:28 CDT", "2021-08-21 04:37:26 CDT", "2021-08-24 03:23:49 CDT",
"2021-08-28 11:22:34 CDT", "2021-08-31 23:20:26 CDT", "2021-09-08 03:54:07 CDT",
"2021-09-11 23:29:38 CDT", "2021-09-16 00:53:41 CDT", "2021-09-25 02:26:09 CDT",
"2021-09-29 22:28:42 CDT", "2021-10-05 03:27:17 CDT", "2021-10-08 23:44:34 CDT",
"2021-10-12 11:56:38 CDT", "2021-10-20 02:28:34 CDT", "2021-11-05 19:02:55 CDT",
"2021-03-09 05:58:50 CST", "2021-04-13 16:46:42 CDT", "2021-04-18 14:34:43 CDT",
"2021-04-25 08:38:19 CDT", "2021-05-06 00:53:20 CDT", "2021-05-12 16:45:31 CDT",
"2021-05-21 23:56:06 CDT", "2021-06-05 20:38:45 CDT", "2021-06-16 14:57:06 CDT",
"2021-07-05 14:54:37 CDT", "2021-07-25 23:25:06 CDT", "2021-08-15 11:22:27 CDT",
"2021-08-31 21:25:10 CDT", "2021-10-03 18:53:57 CDT", "2021-11-08 16:03:34 CST",
"2021-02-16 01:15:13 CST", "2021-03-06 00:28:03 CST"), Station = c("483231",
"484449", "484449", "484449", "484449", "484449", "484449", "484449",
"484449", "484449", "484449", "484449", "484449", "484449", "484449",
"484449", "484449", "484449", "484449", "484450", "484450", "484450",
"484450", "484450", "484450", "484450", "484450", "484450", "484450",
"484450", "484450", "484451", "484453", "484455", "484455", "484455",
"484455", "484455", "484455", "484455", "484455", "484455", "484455",
"484455", "484455", "484455", "484455", "484455", "484455", "484455",
"484455", "484455", "484455", "484455", "484455", "484455", "484455",
"484455", "484455", "484455", "484455", "484455", "484455", "484455",
"484455", "484455", "484455", "484455", "484455", "484455", "484455",
"484455", "484455", "484455", "484455", "484455", "484455", "484455",
"484455", "484455", "484455", "484455", "484455", "484456", "484456",
"484456", "484456", "484456", "484456", "484456", "484456", "484456",
"484456", "484456", "484456", "484456", "484456", "484456", "484457",
"484457"), ID = c("26382", "52448", "52448", "52448", "59597",
"52481", "59685", "59597", "59625", "52448", "59607", "59597",
"52476", "52452", "52443", "52490", "52452", "59607", "52484",
"52478", "52459", "52476", "52452", "52456", "52448", "52476",
"52490", "52461", "52452", "59597", "52488", "52481", "52465",
"52430", "52434", "52553", "52545", "52433", "52497", "52439",
"52535", "59617", "59620", "59686", "52553", "54225", "54225",
"50313", "50313", "52549", "52549", "59620", "28726", "52436",
"56724", "59683", "52436", "50315", "59683", "59684", "52555",
"52431", "28723", "59683", "52431", "52431", "52497", "52559",
"52497", "52541", "52523", "59684", "52436", "59683", "59684",
"59684", "52431", "59617", "52436", "52436", "52555", "52436",
"59684", "52497", "59617", "52522", "52497", "52549", "52535",
"59686", "52496", "52436", "59617", "52497", "59682", "52436",
"59683", "59685", "52430", "52438")), class = "data.frame", row.names = c(15682L,
113503L, 123508L, 133522L, 143529L, 153531L, 163546L, 173553L,
183563L, 193570L, 203578L, 213582L, 223586L, 233588L, 243592L,
253608L, 263609L, 279539L, 291018L, 308142L, 320585L, 332244L,
343277L, 354611L, 365588L, 377103L, 388219L, 399208L, 410181L,
421791L, 438325L, 490101L, 570616L, 581633L, 595778L, 610533L,
621113L, 631630L, 642195L, 652737L, 663238L, 674401L, 685098L,
695576L, 706043L, 716540L, 727054L, 737734L, 748313L, 758811L,
769297L, 779851L, 790361L, 800892L, 811420L, 821980L, 832634L,
843246L, 853770L, 864265L, 874930L, 885540L, 896129L, 906876L,
917594L, 928287L, 938949L, 949584L, 960180L, 970680L, 981116L,
991790L, 1002945L, 1014113L, 1025331L, 1036973L, 1049346L, 1060826L,
1072594L, 1084158L, 1095904L, 1108937L, 1123610L, 1142055L, 1157723L,
1168503L, 1179545L, 1191392L, 1202620L, 1214257L, 1226959L, 1238952L,
1252276L, 1265805L, 1279392L, 1292293L, 1308823L, 1328560L, 1342239L,
1354986L))
for (i in 1:nrow(df)){
Row<-which(Lookup[,"Station"]==df[i,"Station"] &
Lookup[,"Deploy"]<=df[i,"DateTime"] &
Lookup[,"Retrieved"]>=df[i,"DateTime"])
if (length(Row)==1){df[i,"Location"]<-Lookup[Row,"Name"]}
else if (length(Row)!=1){df[i,"Location"]<-"Error"}
}