In the code below, I have a function called correct_admin_names that takes in main_data, shapefile_data, and the variable they have in common, var_by. The goal of this function is to correct the variable they have in common in main_data by matching it with shapefile_data. However, the length of the output from correct_admin_names() does not match the length of the original data given (it currently gives a length of 1402, but I expect it to be 1420). I want it to match the columns during recycling and not just repeat the values, as the main column is different throughout. Can someone help me fix this code?
# Load the required packages
library(dplyr)
library(cli)
library(stringi)
library(fuzzyjoin)
# create dummy data
main_data <- data.frame(
var2 = rep(1:10, each = 142),
adm2 = paste0("District_", 1:1420),
value = rnorm(1420)
)
# create shapefile data
shapefile_data <- data.frame(
var3 = rep(1:10, each = 10),
adm2 = paste0("District_", 1:100),
region = paste0("Region_", rep(1:10, each = 10))
)
# function to call
correct_admin_names <- function(main_data, shapefile_data, match_var,
method = c("jw"),
ignore_case = TRUE,
max_dist = 0.27) {
# to simplify text
text_simplify <- function(x) {
gsub(
"[[:punct:][:space:]]",
"",
tolower(stringi::stri_trans_general(x, "latin-ascii"))
)
}
# Mutate the shapefile_data dataset to create the variable to check
shapefile_data_mutated <- shapefile_data %>%
mutate(match_var_y = {{ match_var }})
# Select the common variable and the variable to check from shapefile_data
shapefile_data_select <- shapefile_data_mutated %>%
as.data.frame() %>%
select({{ match_var }}, match_var_y)
# Join the two datasets by the common variable
joined_data <- left_join(main_data, shapefile_data_select, by = match_var, multiple = "all")
# Filter for rows where the selected variable from shapefile_data is missing
missing_data <- joined_data %>%
filter(is.na(match_var_y)) %>% pull({{ match_var }}) %>% unique()
# Fuzzy match the missing data against the shapefile data
fuzzy_match <- missing_data %>%
# turn into a data.frame
as.data.frame() %>%
# simplify the text for matching
mutate(simplified_text = text_simplify(missing_data)) %>%
# rename the column to adm2
rename_with( ~ {{ match_var }}, 1) %>%
# perform fuzzy join using stringdist_left_join from fuzzyjoin package
stringdist_left_join(shapefile_data_select %>% mutate(simplified_text = text_simplify(!!sym(match_var))),
by = c("simplified_text"), ignore_case = ignore_case, max_dist = max_dist, method = method) %>%
# drop the rows that repeat
distinct() %>%
# drop the cols that repeat
select(main_data = .data[[paste0(match_var, ".x")]],
shape_data = .data[[paste0(match_var, ".y")]],
# simplified_match_main = .data[[paste0("simplified_text.x")]],
# simplified_match_shape = .data[[paste0("simplified_text.y")]]
) %>%
as_tibble() %>%
select(adm2 = main_data, shape_data)
# Replace the main_data with the corrected names from the lookup_table
corrected_main_data <- main_data %>%
as.data.frame() %>%
left_join(., fuzzy_match, by = {{ match_var }}, multiple = "all") %>%
select(shape_data)
return(corrected_main_data)
}
# Call function
main_data %>%
mutate(adm2_clean = correct_admin_names(. , shapefile_data, method = 'jaccard', match_var = 'adm2', max_dist = 0.19))