I have found that with a lot of data, the `stringdist` function can get bogged down. So, if you run into issues with speed, there are other options (e.g., the `RecordLinkage` package or `agrep`) and other methods for matching strings (i.e., other measures of distance). Also, it is not 100% clear what you are asking, but if your issue is that you also want to test matches with the first and last names flipped, you could always use `strsplit`.
For example,
> library(stringdist)
>
> # Table A: the names we want to look up
> word <- c("PILLAY NOLAN VICTOR", "PILLAY NICHOLAS")
> # Master Table: the candidate names to match against
> choices <- c("IGOR JOSE VICTOR", "WILLIAM NICHOLAS", "NOLAN PILLAY", "NICHOLAS PILLAY")
>
> # Try # 1: compare each name against the choices exactly as written
> # Smallest Levenshtein distance from each name to any choice
> match_dist <- sapply(word, function(nm) {
+   lv <- stringdist(nm, choices, method = "lv")
+   min(lv)
+ })
>
> # The choice that attains that smallest distance (which.min picks the
> # first one on ties)
> match_text <- sapply(word, function(nm) {
+   lv <- stringdist(nm, choices, method = "lv")
+   choices[which.min(lv)]
+ })
>
> # One row per looked-up name; data.frame() turns the quoted column names
> # into traveler.name / people.name
> df <- data.frame(
+   "traveler name" = word,
+   "people name" = match_text,
+   "dist" = match_dist,
+   stringsAsFactors = FALSE,
+   row.names = NULL
+ )
> # Checking results
> df
traveler.name people.name dist
1 PILLAY NOLAN VICTOR IGOR JOSE VICTOR 9
2 PILLAY NICHOLAS WILLIAM NICHOLAS 3
>
>
> # Reversing strings, assuming names are separated by a space
> reversed <- sapply(strsplit(choices, " "), function(parts) {
+   flipped <- rev(parts) # word order flipped
+   paste(flipped, collapse = " ")
+ })
> # Add the flipped names to the candidates and drop duplicates
> choices <- unique(c(choices, reversed))
>
>
> # Try # 2: same matching as before, now against the extended choices
> # Smallest Levenshtein distance from each name to any choice
> match_dist <- sapply(word, function(nm) {
+   lv <- stringdist(nm, choices, method = "lv")
+   min(lv)
+ })
>
> # The choice that attains that smallest distance (which.min picks the
> # first one on ties)
> match_text <- sapply(word, function(nm) {
+   lv <- stringdist(nm, choices, method = "lv")
+   choices[which.min(lv)]
+ })
>
> df <- data.frame(
+   "traveler name" = word,
+   "people name" = match_text,
+   "dist" = match_dist,
+   stringsAsFactors = FALSE,
+   row.names = NULL
+ )
>
> # Checking the new results
> df
traveler.name people.name dist
1 PILLAY NOLAN VICTOR PILLAY NOLAN 7
2 PILLAY NICHOLAS PILLAY NICHOLAS 0
Depending on how your data is set up, you may find it helpful (or not) to get rid of middle names or clean the data in other ways, but this should get you started.
EDIT:
I tested a couple of different solutions but did not test `agrep`, so that may be worth checking out. I would definitely favor `RecordLinkage`, and I would even consider breaking up your data set into perfect matches and non-matches, and then only reversing (or sorting) the non-matches. The code will bottleneck on calculating the measure of distance, so anything that reduces the number of names needing a measure of distance will probably help you.
> library(stringdist)
> library(RecordLinkage)
> library(microbenchmark)
>
> #Table A
> word <- c("PILLAY NOLAN VICTOR", "PILLAY NICHOLAS", "WILLIAM NICHOLAS")
> #Master Table
> choices <- c("IGOR JOSE VICTOR","WILLIAM NICHOLAS","NOLAN PILLAY","NICHOLAS PILLAY")
>
> # Time three matching strategies. Each expression is named so that the
> # rows of the timing table are labelled revers / reversRL / sort rather
> # than with the deparsed code of the expression.
> microbenchmark(
+   revers = {
+     # All reversed: add every choice with its word order flipped, then
+     # match using stringdist's Levenshtein distance
+     reversed <- sapply(strsplit(choices, " "), function(x) paste(rev(x), collapse=" ")) #reversing words
+     choices1 <- c(choices, reversed)
+     choices1 <- unique(choices1)
+
+     # Smallest distance per name, and the choice that attains it
+     match_dist <- sapply(word, function(x) min(stringdist(x, choices1, method = "lv")))
+     match_text <- sapply(word, function(x) choices1[which.min(stringdist(x, choices1, method = "lv"))])
+
+     df1 <- data.frame("traveler name" = word,
+                       "people name" = match_text,
+                       "dist" = match_dist,
+                       stringsAsFactors = FALSE, row.names = NULL)
+   },
+
+   reversRL = {
+     # Record linkage: same reversed-choices approach, but the distance
+     # comes from RecordLinkage's levenshteinDist()
+     reversed <- sapply(strsplit(choices, " "), function(x) paste(rev(x), collapse=" ")) #reversing words
+     choices2 <- c(choices, reversed)
+     choices2 <- unique(choices2)
+
+     match_dist2 <- sapply(word, function(x) min(levenshteinDist(x, choices2)))
+     match_text2 <- sapply(word, function(x) choices2[which.min(levenshteinDist(x, choices2))])
+
+     df2 <- data.frame("traveler name" = word,
+                       "people name" = match_text2,
+                       "dist" = match_dist2,
+                       stringsAsFactors = FALSE, row.names = NULL)
+   },
+
+   sort = {
+     # Sorted: put the words of every name in sorted order on both sides
+     # instead of reversing them
+     sorted <- sapply(strsplit(choices, " "), function(x) paste(sort(x), collapse=" ")) #sorting choices
+     choices3 <- c(choices, sorted)
+     choices3 <- unique(choices3)
+     word3 <- sapply(strsplit(word, " "), function(x) paste(sort(x), collapse=" ")) #sorting words
+
+     match_dist3 <- sapply(word3, function(x) min(stringdist(x, choices3, method = "lv")))
+     match_text3 <- sapply(word3, function(x) choices3[which.min(stringdist(x, choices3, method = "lv"))])
+
+     # Note: the traveler name column holds the sorted names (word3)
+     df3 <- data.frame("traveler name" = word3,
+                       "people name" = match_text3,
+                       "dist" = match_dist3,
+                       stringsAsFactors = FALSE, row.names = NULL)
+   },
+   # times = 1: each expression is evaluated once, so every summary column
+   # of a row reports the same single timing
+   times = 1)
Unit: milliseconds
expr min lq mean median uq max neval
revers 6.627258 6.627258 6.627258 6.627258 6.627258 6.627258 1
reversRL 4.016632 4.016632 4.016632 4.016632 4.016632 4.016632 1
sort 7.223453 7.223453 7.223453 7.223453 7.223453 7.223453 1
>
> all.equal(df1, df2)
[1] TRUE
>
> df2
traveler.name people.name dist
1 PILLAY NOLAN VICTOR PILLAY NOLAN 7
2 PILLAY NICHOLAS PILLAY NICHOLAS 0
3 WILLIAM NICHOLAS WILLIAM NICHOLAS 0
> df3
traveler.name people.name dist
1 NOLAN PILLAY VICTOR NOLAN PILLAY 7
2 NICHOLAS PILLAY NICHOLAS PILLAY 0
3 NICHOLAS WILLIAM NICHOLAS WILLIAM 0