1

I'm trying to create a new dataframe that joins the names of some columns and their values.

My inputs looks like this:

input1 = structure(list(Date = structure(c(1677502800, 1677502800,
1677502800, 1677502800, 1677502800, 1677502800), class = c("POSIXct",
"POSIXt"), tzone = ""), Name = c("Rome_Italy", "Paris_France",
"Beijing_China", "Boston_USA", "Moscow_Russia",
"Sydney_Australia"), ReportType = c("SALES",
"SALES", "SALES", "SALES", "SALES", "SALES"), TestType = c("Internal",
"Internal", "Internal", "Internal", "Internal", "Internal"), Code1 = c("N/A",
"N/A", "N/A", "N/A", "N/A", "N/A"), Code2 = c("N/A",
"N/A", "N/A", "N/A", "N/A", "N/A"
), Result1 = c("XMAS_DOWN", "XMAS_DOWN", "XMAS_DOWN", "XMAS_DOWN",
"XMAS_DOWN", "XMAS_DOWN"), Result2 = c("N/A", "N/A", "N/A", "N/A",
"N/A", "N/A"), Value = c(24, 6, 0, 9,
-7, -13)), row.names = c(NA, 6L), class = "data.frame")


input2 = structure(list(Date = structure(c(1677502800, 1677502800,
1677502800, 1677502800, 1677502800, 1677502800), class = c("POSIXct",
"POSIXt"), tzone = ""), Name = c("Rome_Italy", "Paris_France",
"Beijing_China", "Boston_USA", "Moscow_Russia",
"Sydney_Australia"), ReportType = c("SALES",
"SALES", "SALES", "SALES", "SALES", "SALES"), TestType = c("Internal",
"Internal", "Internal", "Internal", "Internal", "Internal"), Code1 = c("N/A",
"N/A", "N/A", "N/A", "N/A", "N/A"), Code2 = c("N/A",
"N/A", "N/A", "N/A", "N/A", "N/A"
), Result1 = c("EAST_DOWN", "EAST_DOWN", "EAST_DOWN", "EAST_DOWN", "EAST_DOWN", "EAST_DOWN" ), Result2 = c("N/A", "N/A", "N/A", "N/A",
"N/A", "N/A"), Value = c(22, 2, 3, 2,
9, 16)), row.names = c(NA, 6L), class = "data.frame")

And my desired output is this:

output = structure(list(Date = structure(c(1677502800, 1677502800,
1677502800, 1677502800, 1677502800, 1677502800), class = c("POSIXct",
"POSIXt"), tzone = ""), Name = c("Rome_Italy", "Paris_France",
"Beijing_China", "Boston_USA", "Moscow_Russia",
"Sydney_Australia"), ReportType = c("SALES",
"SALES", "SALES", "SALES", "SALES", "SALES"), TestType = c("Internal",
"Internal", "Internal", "Internal", "Internal", "Internal"), Code1 = c("N/A",
"N/A", "N/A", "N/A", "N/A", "N/A"), Code2 = c("N/A",
"N/A", "N/A", "N/A", "N/A", "N/A"
), Result1 = c("XMAS_DOWN + EAST_DOWN", "XMAS_DOWN + EAST_DOWN", "XMAS_DOWN + EAST_DOWN", "XMAS_DOWN + EAST_DOWN", "XMAS_DOWN + EAST_DOWN", "XMAS_DOWN + EAST_DOWN" ), Result2 = c("N/A", "N/A", "N/A", "N/A",
"N/A", "N/A"), Value = c(46, 8, 3, 11,
2, 3)), row.names = c(NA, 6L), class = "data.frame")

I can get to this result using a join, summing the values, paste the Result1 together, and then selecting the columns I want to keep:

test = left_join(input1, input2, by = c('Date', 'Name', 'ReportType', 'TestType', 'Code1', 'Code2', 'Result2'))
test$Value = test$Value.x + test$Value.y
test$Result1 = paste(test$Result1.x, "+", test$Result1.y)
test_1 = select(test, c('Date', 'Name', 'ReportType', 'TestType', 'Code1', 'Code2', 'Result1', 'Result2', 'Value'))

But this feels like a really clumsy solution.

Can anyone suggest a better way of doing it?

M--
  • 25,431
  • 8
  • 61
  • 93

2 Answers2

0

You can bind_rows and then summarise where you group_by all columns except Result1 and Value:

bind_rows(input1, input2) %>%
  group_by(across(-c(Result1, Value)))  %>% 
  summarise(Value = sum(Value, na.rm = T), 
            Result1 = paste0(Result1, collapse = " + "), 
            .groups = "drop") %>% 
  select(names(input1))
# A tibble: 6 x 9
  Date                Name             ReportType TestType Code1 Code2 Result1               Result2 Value
  <dttm>              <chr>            <chr>      <chr>    <chr> <chr> <chr>                 <chr>   <dbl>
1 2023-02-27 08:00:00 Beijing_China    SALES      Internal N/A   N/A   XMAS_DOWN + EAST_DOWN N/A         3
2 2023-02-27 08:00:00 Boston_USA       SALES      Internal N/A   N/A   XMAS_DOWN + EAST_DOWN N/A        11
3 2023-02-27 08:00:00 Moscow_Russia    SALES      Internal N/A   N/A   XMAS_DOWN + EAST_DOWN N/A         2
4 2023-02-27 08:00:00 Paris_France     SALES      Internal N/A   N/A   XMAS_DOWN + EAST_DOWN N/A         8
5 2023-02-27 08:00:00 Rome_Italy       SALES      Internal N/A   N/A   XMAS_DOWN + EAST_DOWN N/A        46
6 2023-02-27 08:00:00 Sydney_Australia SALES      Internal N/A   N/A   XMAS_DOWN + EAST_DOWN N/A         3
M--
  • 25,431
  • 8
  • 61
  • 93
0

This is great use case for {powerjoin}:

powerjoin::power_left_join(
  input1, input2, 
  by = c('Date', 'Name', 'ReportType', 'TestType', 'Code1', 'Code2', 'Result2'),
  conflict = list(Result1 = ~paste(.x, "+", .y), Value = `+`)
)
#>                  Date             Name ReportType TestType Code1 Code2 Result2
#> 1 2023-02-27 14:00:00       Rome_Italy      SALES Internal   N/A   N/A     N/A
#> 2 2023-02-27 14:00:00     Paris_France      SALES Internal   N/A   N/A     N/A
#> 3 2023-02-27 14:00:00    Beijing_China      SALES Internal   N/A   N/A     N/A
#> 4 2023-02-27 14:00:00       Boston_USA      SALES Internal   N/A   N/A     N/A
#> 5 2023-02-27 14:00:00    Moscow_Russia      SALES Internal   N/A   N/A     N/A
#> 6 2023-02-27 14:00:00 Sydney_Australia      SALES Internal   N/A   N/A     N/A
#>                 Result1 Value
#> 1 XMAS_DOWN + EAST_DOWN    46
#> 2 XMAS_DOWN + EAST_DOWN     8
#> 3 XMAS_DOWN + EAST_DOWN     3
#> 4 XMAS_DOWN + EAST_DOWN    11
#> 5 XMAS_DOWN + EAST_DOWN     2
#> 6 XMAS_DOWN + EAST_DOWN     3

Created on 2023-03-17 with reprex v2.0.2

moodymudskipper
  • 46,417
  • 11
  • 121
  • 167