8

I have a dataframe 'df' that has categorical and POSIXct columns. The data look like:

Category DateTime
A 2022-08-29 00:00:00
A 2022-08-29 00:00:00
A 1 2022-08-29 00:00:00
A 1 2022-08-29 00:00:00
A 1 2022-08-29 00:00:00
B 2022-08-29 00:00:00
B 2022-08-29 00:00:00
B 2022-08-29 00:00:00
B 1 2022-08-29 00:00:00
B 1 2022-08-29 00:00:00
B 1 2022-08-29 00:00:00
B 1 2022-08-29 00:00:00
B 1 2022-08-29 00:00:00
A 2022-08-29 02:00:00
A 1 2022-08-29 02:00:00
B 2022-08-29 02:00:00
B 2022-08-29 02:00:00
B 2022-08-29 02:00:00
B 1 2022-08-29 02:00:00
B 1 2022-08-29 02:00:00
B 1 2022-08-29 02:00:00

I would like to create a new dataframe 'df2' that counts each unique value in column 'Category' that doesn't end with " 1" by column 'DateTime', so that the results look like:

Category DateTime CatCount
A 2022-08-29 00:00:00 2
B 2022-08-29 00:00:00 3
A 2022-08-29 02:00:00 1
B 2022-08-29 02:00:00 3
Maël
  • 45,206
  • 3
  • 29
  • 67
Jacob
  • 329
  • 2
  • 10

6 Answers6

4

We could filter output the rows having 1, then do a count

library(dplyr)
library(stringr)
df1 %>%
   filter(str_detect(Category, "\\s+1", negate = TRUE)) %>%
   count(Category, DateTime, name = "CatCount")

-output

 Category            DateTime CatCount
1        A 2022-08-29 00:00:00        2
2        A 2022-08-29 02:00:00        1
3        B 2022-08-29 00:00:00        3
4        B 2022-08-29 02:00:00        3

data

df1 <- structure(list(Category = c("A", "A", "A 1", "A 1", "A 1", "B", 
"B", "B", "B 1", "B 1", "B 1", "B 1", "B 1", "A", "A 1", "B", 
"B", "B", "B 1", "B 1", "B 1"), DateTime = c("2022-08-29 00:00:00", 
"2022-08-29 00:00:00", "2022-08-29 00:00:00", "2022-08-29 00:00:00", 
"2022-08-29 00:00:00", "2022-08-29 00:00:00", "2022-08-29 00:00:00", 
"2022-08-29 00:00:00", "2022-08-29 00:00:00", "2022-08-29 00:00:00", 
"2022-08-29 00:00:00", "2022-08-29 00:00:00", "2022-08-29 00:00:00", 
"2022-08-29 02:00:00", "2022-08-29 02:00:00", "2022-08-29 02:00:00", 
"2022-08-29 02:00:00", "2022-08-29 02:00:00", "2022-08-29 02:00:00", 
"2022-08-29 02:00:00", "2022-08-29 02:00:00")),
class = "data.frame", row.names = c(NA, 
-21L))
akrun
  • 874,273
  • 37
  • 540
  • 662
4

Another base R solution with transform + aggregate:

transform(df1[!grepl("1$", df1$Category),], count = 1) |>
  aggregate(count ~ Category + DateTime, data = _, length)

  Category            DateTime count
1        A 2022-08-29 00:00:00     2
2        B 2022-08-29 00:00:00     3
3        A 2022-08-29 02:00:00     1
4        B 2022-08-29 02:00:00     3
Maël
  • 45,206
  • 3
  • 29
  • 67
  • 2
    Also `aggregate(count ~ ., data = _, length)` will work instead of `aggregate(count ~ Category + DateTime, data = _, length)` – GKi Aug 30 '22 at 15:25
3
library(dplyr)
your_data %>%
  filter(!endsWith(Category, "1")) %>%
  count(Category, DateTime)
Gregor Thomas
  • 136,190
  • 20
  • 167
  • 294
3

Or a base solution using table:

df[nchar(df$Category) == 1,] |>
  table() |>
  as.data.frame(responseName = "CatCount")

We could of course subset in various ways df[!endsWith(df$Category, "1"),] (as @ Gregor Thomas) or df[!grepl("\\s+1", df$Category),] (as @ akrun).

Output:

  Category            DateTime CatCount
1        A 2022-08-29 00:00:00        2
2        B 2022-08-29 00:00:00        3
3        A 2022-08-29 02:00:00        1
4        B 2022-08-29 02:00:00        3

Data:

library(readr)

df <- read_delim("Category,DateTime
A,2022-08-29 00:00:00
A,2022-08-29 00:00:00
A 1,2022-08-29 00:00:00
A 1,2022-08-29 00:00:00
A 1,2022-08-29 00:00:00
B,2022-08-29 00:00:00
B,2022-08-29 00:00:00
B,2022-08-29 00:00:00
B 1,2022-08-29 00:00:00
B 1,2022-08-29 00:00:00
B 1,2022-08-29 00:00:00
B 1,2022-08-29 00:00:00
B 1,2022-08-29 00:00:00
A,2022-08-29 02:00:00
A 1,2022-08-29 02:00:00
B,2022-08-29 02:00:00
B,2022-08-29 02:00:00
B,2022-08-29 02:00:00
B 1,2022-08-29 02:00:00
B 1,2022-08-29 02:00:00
B 1,2022-08-29 02:00:00", delim = ",")

Update: Data added.

harre
  • 7,081
  • 2
  • 16
  • 28
2

Here is a data.table option, where we can use grepl (or could also use stringr) to ignore any rows that contain a number in Category, then count using .N.

library(data.table)

setDT(dt)[!grepl("\\d", Category), .N, .(Category, DateTime)]

Output

   Category            DateTime N
1:        A 2022-08-29 00:00:00 2
2:        B 2022-08-29 00:00:00 3
3:        A 2022-08-29 02:00:00 1
4:        B 2022-08-29 02:00:00 3

Data

dt <- structure(list(Category = c("A", "A", "A 1", "A 1", "A 1", "B", 
"B", "B", "B 1", "B 1", "B 1", "B 1", "B 1", "A", "A 1", "B", 
"B", "B", "B 1", "B 1", "B 1"), DateTime = c("2022-08-29 00:00:00", 
"2022-08-29 00:00:00", "2022-08-29 00:00:00", "2022-08-29 00:00:00", 
"2022-08-29 00:00:00", "2022-08-29 00:00:00", "2022-08-29 00:00:00", 
"2022-08-29 00:00:00", "2022-08-29 00:00:00", "2022-08-29 00:00:00", 
"2022-08-29 00:00:00", "2022-08-29 00:00:00", "2022-08-29 00:00:00", 
"2022-08-29 02:00:00", "2022-08-29 02:00:00", "2022-08-29 02:00:00", 
"2022-08-29 02:00:00", "2022-08-29 02:00:00", "2022-08-29 02:00:00", 
"2022-08-29 02:00:00", "2022-08-29 02:00:00")), class = "data.frame", row.names = c(NA, 
-21L))
AndrewGB
  • 16,126
  • 5
  • 18
  • 49
2

We may conveniently use the subset argument of aggregate.

aggregate(cbind(CatCount=rep(1, length(Category))) ~ Category + DateTime, df1, length, 
          subset=!grepl('1', Category))
#   Category            DateTime CatCount
# 1        A 2022-08-29 00:00:00        2
# 2        B 2022-08-29 00:00:00        3
# 3        A 2022-08-29 02:00:00        1
# 4        B 2022-08-29 02:00:00        3

Data borrowed from @akrun.

jay.sf
  • 60,139
  • 8
  • 53
  • 110