You can try this function, it can create the first and last flag and deal with NA
like sas.
R dplyr::arrange
: NA are always sorted to the end for local data, even
when wrapped with dplyr::desc().
SAS PROC SORT
: a missing value for a numeric variable is smaller than all numbers.
Missing values of character variables are smaller than any printable character value.
The function use to sort data like SAS that the missing value is smallest, and create variable first and last for each sort
variable.
library(dplyr, warn.conflicts = FALSE)
#' Sort data rows and create first and last like SAS
#'
#' @param data input data
#' @param ... variables for sort
#' @param first_last logical value, whether or not create the first and last variables
#' @param first_prefix character string of prefix for creating first variable
#' @param last_prefix character string of prefix for creating last variable
sas_sort <- function(data, ...,
first_last = TRUE,
first_prefix = ".first.",
last_prefix = ".last.") {
stopifnot(!missing(data), is.data.frame(data))
if (dplyr::is.grouped_df(data)) {
message(
"the input data is grouped by `",
dplyr::group_vars(data),
"`, and wil be ungrouped "
)
data <- data %>% dplyr::ungroup()
}
dots <- rlang::enexprs(...)
if (length(dots) == 0) {
stop("argumetn `...` is empty")
}
sort <- vector("list")
for (i in seq_along(dots)) {
dot <- dots[[i]]
dot_str <- deparse(dot)
if (stringr::str_detect(dot_str, "((dplyr::)?desc\\()(.+)(\\))")) {
sort <- append(sort, dot)
} else {
sort <- append(sort, rlang::parse_expr(paste0("!is.na(", dot_str, ")")))
sort <- append(sort, dot)
}
}
data <- data %>% dplyr::arrange(!!!sort)
if (first_last) {
for (i in seq_along(dots)) {
dot <- dots[[i]]
dot_str <- deparse(dot)
if (stringr::str_detect(dot_str, "((dplyr::)?desc\\()(.+)(\\))")) {
dot_str <- stringr::str_extract(dot_str, "((dplyr::)?desc\\()(.+)(\\))", group = 3)
dot <- rlang::sym(dot_str)
}
first <- paste0(first_prefix, dot_str)
last <- paste0(last_prefix, dot_str)
data <- data %>%
dplyr::group_by(!!dot, .add = TRUE) %>%
dplyr::arrange(!!!sort) %>%
dplyr::mutate(
!!first := dplyr::row_number() == 1L,
!!last := dplyr::row_number() == dplyr::n()
)
}
}
data %>% dplyr::ungroup()
}
# this data is from SAS Programmer’s Guide: Essentials
# FIRST. and LAST. DATA Step Variables
# Example 1: Grouping Observations by State, City, and ZIP Code
zip <- tibble::tribble(
~State, ~City, ~ZipCode, ~Street,
"AZ", "Tucson", 85730L, "Domenic Ln",
"AZ", "Tucson", 85730L, "Gleeson Pl",
"FL", "Lakeland", 33801L, "French Ave",
"FL", "Lakeland", 33809L, "Egret Dr",
"FL", "Miami", 33133L, "Rice St",
"FL", "Miami", 33133L, "Thomas Ave",
"FL", "Miami", 33133L, "Surrey Dr",
"FL", "Miami", 33133L, "Trade Ave",
"FL", "Miami", 33146L, "Nervia St",
"FL", "Miami", 33146L, "Corsica St"
)
zip_sort_r <- sas_sort(zip, State, City, ZipCode,
first_prefix = "first_",
last_prefix = "last_")
zip_sort_r
#> # A tibble: 10 × 10
#> State City ZipCode Street first_State last_State first_City last_City
#> <chr> <chr> <int> <chr> <lgl> <lgl> <lgl> <lgl>
#> 1 AZ Tucson 85730 Domenic Ln TRUE FALSE TRUE FALSE
#> 2 AZ Tucson 85730 Gleeson Pl FALSE TRUE FALSE TRUE
#> 3 FL Lakeland 33801 French Ave TRUE FALSE TRUE FALSE
#> 4 FL Lakeland 33809 Egret Dr FALSE FALSE FALSE TRUE
#> 5 FL Miami 33133 Rice St FALSE FALSE TRUE FALSE
#> 6 FL Miami 33133 Thomas Ave FALSE FALSE FALSE FALSE
#> 7 FL Miami 33133 Surrey Dr FALSE FALSE FALSE FALSE
#> 8 FL Miami 33133 Trade Ave FALSE FALSE FALSE FALSE
#> 9 FL Miami 33146 Nervia St FALSE FALSE FALSE FALSE
#> 10 FL Miami 33146 Corsica St FALSE TRUE FALSE TRUE
#> # ℹ 2 more variables: first_ZipCode <lgl>, last_ZipCode <lgl>
df <- tibble::tribble(
~x, ~y, ~z,
"b", 2L, NA,
NA, 1L, NA,
NA, 2L, NA,
NA, NA, NA,
NA, NA, "a",
"a", NA, "a",
"a", 1L, "a",
"a", 2L, "b",
"b", NA, NA,
"b", 1L, "b",
"a", NA, NA,
NA, 1L, "b",
"b", NA, "b",
"a", 2L, "a",
"b", 2L, "b",
NA, 2L, "b",
NA, 1L, "a",
"b", 1L, NA,
"a", NA, "b",
"b", NA, "a",
"a", 2L, NA,
"a", 1L, "b",
"a", 1L, NA,
"b", 1L, "a",
"b", 2L, "a",
NA, NA, "b",
NA, 2L, "a"
)
sort1 <- sas_sort(df,x,y,z)
sort1
#> # A tibble: 27 × 9
#> x y z .first.x .last.x .first.y .last.y .first.z .last.z
#> <chr> <int> <chr> <lgl> <lgl> <lgl> <lgl> <lgl> <lgl>
#> 1 <NA> NA <NA> TRUE FALSE TRUE FALSE TRUE TRUE
#> 2 <NA> NA a FALSE FALSE FALSE FALSE TRUE TRUE
#> 3 <NA> NA b FALSE FALSE FALSE TRUE TRUE TRUE
#> 4 <NA> 1 <NA> FALSE FALSE TRUE FALSE TRUE TRUE
#> 5 <NA> 1 a FALSE FALSE FALSE FALSE TRUE TRUE
#> 6 <NA> 1 b FALSE FALSE FALSE TRUE TRUE TRUE
#> 7 <NA> 2 <NA> FALSE FALSE TRUE FALSE TRUE TRUE
#> 8 <NA> 2 a FALSE FALSE FALSE FALSE TRUE TRUE
#> 9 <NA> 2 b FALSE TRUE FALSE TRUE TRUE TRUE
#> 10 a NA <NA> TRUE FALSE TRUE FALSE TRUE TRUE
#> # ℹ 17 more rows
sort2 <- sas_sort(df, x, dplyr::desc(y), z)
sort2
#> # A tibble: 27 × 9
#> x y z .first.x .last.x .first.y .last.y .first.z .last.z
#> <chr> <int> <chr> <lgl> <lgl> <lgl> <lgl> <lgl> <lgl>
#> 1 <NA> 2 <NA> TRUE FALSE TRUE FALSE TRUE TRUE
#> 2 <NA> 2 a FALSE FALSE FALSE FALSE TRUE TRUE
#> 3 <NA> 2 b FALSE FALSE FALSE TRUE TRUE TRUE
#> 4 <NA> 1 <NA> FALSE FALSE TRUE FALSE TRUE TRUE
#> 5 <NA> 1 a FALSE FALSE FALSE FALSE TRUE TRUE
#> 6 <NA> 1 b FALSE FALSE FALSE TRUE TRUE TRUE
#> 7 <NA> NA <NA> FALSE FALSE TRUE FALSE TRUE TRUE
#> 8 <NA> NA a FALSE FALSE FALSE FALSE TRUE TRUE
#> 9 <NA> NA b FALSE TRUE FALSE TRUE TRUE TRUE
#> 10 a 2 <NA> TRUE FALSE TRUE FALSE TRUE TRUE
#> # ℹ 17 more rows
# delete the first and last
delete_first_last <- sas_sort(df, x, dplyr::desc(y), z) %>%
dplyr::select(
-dplyr::starts_with(".first."),
-dplyr::starts_with(".last.")
)
delete_first_last
#> # A tibble: 27 × 3
#> x y z
#> <chr> <int> <chr>
#> 1 <NA> 2 <NA>
#> 2 <NA> 2 a
#> 3 <NA> 2 b
#> 4 <NA> 1 <NA>
#> 5 <NA> 1 a
#> 6 <NA> 1 b
#> 7 <NA> NA <NA>
#> 8 <NA> NA a
#> 9 <NA> NA b
#> 10 a 2 <NA>
#> # ℹ 17 more rows
Created on 2023-07-19 with reprex v2.0.2