10

I have data in the following format for people punching their work times in:

(dat<-data.frame(Date = c("1/1/19", "1/2/19", "1/4/19", "1/2/19"), 
                Person = c("John Doe", "Brian Smith", "Jane Doe", "Alexandra Wakes"), 
                Time_In = c("1:15pm", "1:45am", "11:00pm", "1:00am"), 
                Time_Out = c("2:30pm","3:33pm","3:00am","1:00am")))

    Date          Person Time_In Time_Out
1 1/1/19        John Doe  1:15pm   2:30pm
2 1/2/19     Brian Smith  1:45am   3:33pm
3 1/4/19        Jane Doe  3:00pm   3:00am
4 1/2/19 Alexandra Wakes  1:00am   1:00am

I am looking to write a function in R or Python that will extract the total number of hours each person worked into 24 different buckets with each bucket as its own column. It would look something like this:

enter image description here

So in the first case, the person worked from 1:15pm to 2:30 pm, so they worked .75 hours from 1pm to 2pm (13-14), and .5 hours from 2pm to 3pm (14-15).

Some things I think may work are...

  1. A series of nested loops
  2. A long series of if/then statements
  3. Some function in Tidyverse or Pandas that I have not thought of yet.

Attempts from #1 and #2 (?) from above were utter failures. Not sure what the workflow is but any advice is much appreciated.

Note that the columns in the resulting table need notbe numbers (could be hour 1, hour 2, etc. or just any factor in general -as long as it represents a 24 hour period of time).

My past attempts have included nested for loops like the following:

for (i in 1:nrow(data)){

  if((int_overlaps(createinterval(data$PunchDate[i],0,1), workinterval[i]))){ `0-1`[i]=1} else{ `0-1`[i]=0}
  if((int_overlaps(createinterval(data$PunchDate[i],1,2), workinterval[i]))){ `1-2`[i]=1} else{ `1-2`[i]=0}
  if((int_overlaps(createinterval(data$PunchDate[i],2,3), workinterval[i]))){ `2-3`[i]=1} else{ `2-3`[i]=0}
  if((int_overlaps(createinterval(data$PunchDate[i],3,4), workinterval[i]))){ `3-4`[i]=1} else{ `3-4`[i]=0}
  if((int_overlaps(createinterval(data$PunchDate[i],4,5), workinterval[i]))){ `4-5`[i]=1} else{ `4-5`[i]=0}
  if((int_overlaps(createinterval(data$PunchDate[i],5,6), workinterval[i]))){ `5-6`[i]=1} else{ `5-6`[i]=0}
  if((int_overlaps(createinterval(data$PunchDate[i],6,7), workinterval[i]))){ `6-7`[i]=1} else{ `6-7`[i]=0}
  if((int_overlaps(createinterval(data$PunchDate[i],7,8), workinterval[i]))){ `7-8`[i]=1} else{ `7-8`[i]=0}
  if((int_overlaps(createinterval(data$PunchDate[i],8,9), workinterval[i]))){ `8-9`[i]=1} else{ `8-9`[i]=0}
  if((int_overlaps(createinterval(data$PunchDate[i],9,10), workinterval[i]))){ `9-10`[i]=1} else{ `9-10`[i]=0}
  if((int_overlaps(createinterval(data$PunchDate[i],10,11), workinterval[i]))){ `10-11`[i]=1} else{ `10-11`[i]=0}
  if((int_overlaps(createinterval(data$PunchDate[i],11,12), workinterval[i]))){ `11-12`[i]=1} else{ `11-12`[i]=0}
  if((int_overlaps(createinterval(data$PunchDate[i],12,13), workinterval[i]))){ `12-13`[i]=1} else{ `12-13`[i]=0}
  if((int_overlaps(createinterval(data$PunchDate[i],13,14), workinterval[i]))){ `13-14`[i]=1} else{ `13-14`[i]=0}
  if((int_overlaps(createinterval(data$PunchDate[i],14,15), workinterval[i]))){ `14-15`[i]=1} else{ `14-15`[i]=0}
  if((int_overlaps(createinterval(data$PunchDate[i],15,16), workinterval[i]))){ `15-16`[i]=1} else{ `15-16`[i]=0}
  if((int_overlaps(createinterval(data$PunchDate[i],16,17), workinterval[i]))){ `16-17`[i]=1} else{ `16-17`[i]=0}
  if((int_overlaps(createinterval(data$PunchDate[i],17,18), workinterval[i]))){ `17-18`[i]=1} else{ `17-18`[i]=0}
  if((int_overlaps(createinterval(data$PunchDate[i],18,19), workinterval[i]))){ `18-19`[i]=1} else{ `18-19`[i]=0}
  if((int_overlaps(createinterval(data$PunchDate[i],19,20), workinterval[i]))){ `19-20`[i]=1} else{ `19-20`[i]=0}
  if((int_overlaps(createinterval(data$PunchDate[i],20,21), workinterval[i]))){ `20-21`[i]=1} else{ `20-21`[i]=0}
  if((int_overlaps(createinterval(data$PunchDate[i],21,22), workinterval[i]))){ `21-22`[i]=1} else{ `21-22`[i]=0}
  if((int_overlaps(createinterval(data$PunchDate[i],22,23), workinterval[i]))){ `22-23`[i]=1} else{ `22-23`[i]=0}
  if((int_overlaps(createinterval(data$PunchDate[i],23,24), workinterval[i]))){ `23-24`[i]=1} else{ `23-24`[i]=0}


}
cbind(data, `0-1`, `1-2`, `2-3`, `3-4`, `4-5`, `5-6`,
            `6-7`, `7-8`, `8-9`, `9-10`, `10-11`, `11-12`,
            `12-13`, `13-14`, `14-15`, `15-16`, `16-17`, `17-18`, `18-19`,
            `19-20`, `20-21`, `21-22`, `22-23`, `23-24`
      )
RedRose
  • 555
  • 7
  • 27
Cyrus Mohammadian
  • 4,982
  • 6
  • 33
  • 62
  • I don't have time right now to work up a full solution for you but you might benefit from looking at: https://stackoverflow.com/questions/38339812/binning-time-data-in-r and the second answer on https://stackoverflow.com/questions/29833538/convert-12-hour-character-time-to-24-hour – HFBrowning Apr 12 '19 at 19:37
  • Oh and this one too: https://stackoverflow.com/questions/5624140/binning-dates-in-r – HFBrowning Apr 12 '19 at 19:38
  • Thanks. Those links (specially the second one) are helpful but I'm still unable to arrive at my desired solution. – Cyrus Mohammadian Apr 12 '19 at 20:38
  • What have you tried so far that you say was an utter failure? It's still helpful to see some of what your thinking is, even if it didn't work out – camille Apr 15 '19 at 20:06
  • I tried a series of for loops that are extremely slow. – Cyrus Mohammadian Apr 15 '19 at 21:56

3 Answers3

8

This involves bit of fiddling with the dates and time but it seems to work using dcast.

library(lubridate)
library(data.table)

# Data
dat<-data.frame(Date = c("1/1/19", "1/2/19", "1/4/19", "1/2/19"), 
                Person = c("John Doe", "Brian Smith", "Jane Doe", "Alexandra Wakes"), 
                Time_In = c("1:15pm", "1:45am", "11:00pm", "1:00am"), 
                Time_Out = c("2:30pm","3:33pm","3:00am","1:00am"))

# Create Date Out field, if out the next day then need to add extra day to the Date in
dat$Date <- as.Date(dat$Date, format = "%m/%d/%y")
dat$Date_out <- as.Date(ifelse(grepl("am", dat$Time_Out), dat$Date + days(1), dat$Date), origin = "1970-01-01")

# Create date time in and out variable in format yyyy-dd-dd hh:mm:ss
dat$time_in1 <- strptime(paste(dat$Date, " ", dat$Time_In, sep = ""), format = "%Y-%m-%d %I:%M%p")
dat$time_out1 <- strptime(paste(dat$Date_out, " ", dat$Time_Out, sep = ""), format = "%Y-%m-%d %I:%M%p")

# Fiddling with dates and time
# This will be used to duplicate data frame x times for dcast below
dat$diff_time <- ceiling(as.numeric(difftime(dat$time_out1, dat$time_in1, units = "hours")))

dat$time_in_min <- format(dat$time_in1, format = "%M")
dat$time_out_min <- format(dat$time_out1, format = "%M")
dat$diff_time <- ifelse(dat$time_out_min < dat$time_in_min, dat$diff_time + 1, dat$diff_time)

# For Time in add extra hour to minus time in, i.e. if time in is 2:35pm then time in will show 3:00pm to calculate 25 minutes
dat$time_in2 <- strptime(dat$time_in1 + hours(1), format = "%Y-%m-%d %H")
dat$time_out2 <- strptime(dat$time_out1, format = "%Y-%m-%d %H")

# Calculate fraction of hours for the Time in/Out
dat$diff_in <- as.numeric(difftime(dat$time_in2, dat$time_in1, units = "hours"))
dat$diff_out <- as.numeric(difftime(dat$time_out1, dat$time_out2, units = "hours"))

# For the 24 hour bucket for each person
dat$start_hr <- format(dat$time_in1, format = "%H")

# Append Data multiple times based on number of hours in between Out and In
dt <- dat[rep(seq_len(nrow(dat)), dat$diff_time), c("Date", "Person", "Time_In", "Time_Out", "start_hr", "diff_in", "diff_out", "diff_time")]
dt <- data.table(dt)

# For the 24 hour bucket for each person
dt[, rank := 1:.N, by = c("Person", "Date", "Time_In", "Time_Out")]
dt[, start_hr2 := as.numeric(start_hr) + rank]

# Combine with Time in and Out to allow fraction of hour start and end
dt[, rank2 := 1]
dt[rank == 1, rank2:= diff_in]
dt[diff_time == rank & diff_out > 0, rank2 := diff_out]

# 24 hours in a day
dt[start_hr2 > 24, start_hr2 := start_hr2 - 24]

# For the data provided it works without this line because Alexander worked 24 hours
# Need this line to include all 24 hour bucket
dt$start_hr2 <- factor(dt$start_hr2, levels = 1:24)

dt_dcast <- dcast(dt, Person + Date + Time_In + Time_Out ~ start_hr2, value.var = "rank2", fill = 0, drop = c(TRUE, FALSE))
setnames(dt_dcast, names(dt_dcast), c("Person", "Date", "Time In", "Time Out", paste0(1:24 - 1, "-", 1:24)))
dt_dcast
            Person       Date Time In Time Out 0-1  1-2 2-3 3-4 4-5 5-6 6-7 7-8 8-9 9-10 10-11 11-12 12-13 13-14 14-15 15-16 16-17 17-18 18-19 19-20 20-21 21-22 22-23 23-24
1: Alexandra Wakes 2019-01-02  1:00am   1:00am   1 1.00   1   1   1   1   1   1   1    1     1     1     1  1.00   1.0  1.00     1     1     1     1     1     1     1     1
2:     Brian Smith 2019-01-02  1:45am   3:33pm   0 0.25   1   1   1   1   1   1   1    1     1     1     1  1.00   1.0  0.55     0     0     0     0     0     0     0     0
3:        Jane Doe 2019-01-04 11:00pm   3:00am   1 1.00   1   0   0   0   0   0   0    0     0     0     0  0.00   0.0  0.00     0     0     0     0     0     0     0     1
4:        John Doe 2019-01-01  1:15pm   2:30pm   0 0.00   0   0   0   0   0   0   0    0     0     0     0  0.75   0.5  0.00     0     0     0     0     0     0     0     0

Using for loop:

library(lubridate)
library(data.table)

# Data
dat<-data.frame(Date = c("1/1/19", "1/2/19", "1/4/19", "1/2/19"), 
                Person = c("John Doe", "Brian Smith", "Jane Doe", "Alexandra Wakes"), 
                Time_In = c("1:15pm", "1:45am", "11:00pm", "1:00am"), 
                Time_Out = c("2:30pm","3:33pm","3:00am","1:00am"))

# Create Date Out field, if out the next day then need to add extra day to the Date in
dat$Date <- as.Date(dat$Date, format = "%m/%d/%y")
dat$Date_out <- as.Date(ifelse(grepl("am", dat$Time_Out), dat$Date + days(1), dat$Date), origin = "1970-01-01")

# Create date time in and out variable in format yyyy-dd-dd hh:mm:ss
dat$time_in <- strptime(paste(dat$Date, " ", dat$Time_In, sep = ""), format = "%Y-%m-%d %I:%M%p")
dat$time_out <- strptime(paste(dat$Date_out, " ", dat$Time_Out, sep = ""), format = "%Y-%m-%d %I:%M%p")

# Create 'hour' gap
dat$diff_time <- ceiling(as.numeric(difftime(dat$time_out, dat$time_in, units = "hours")))
dat$time_in_min <- format(dat$time_in, format = "%M")
dat$time_out_min <- format(dat$time_out, format = "%M")
dat$diff_time <- ifelse(dat$time_out_min < dat$time_in_min, dat$diff_time + 1, dat$diff_time)

# For the 24 hour bucket for each person
dat$start_hr <- as.numeric(format(dat$time_in + hours(1), format = "%H"))
dat$start_hr <- ifelse(dat$start_hr == 0, 24, dat$start_hr)

# For Time in add extra hour to minus time in, i.e. if time in is 2:35pm then time in will show 3:00pm to calculate 25 minutes
dat$hour_in <- strptime(dat$time_in + hours(1), format = "%Y-%m-%d %H")
dat$hour_out <- strptime(dat$time_out, format = "%Y-%m-%d %H")

# Calculate fraction of hours for the Time in/Out
dat$diff_in <- as.numeric(difftime(dat$hour_in, dat$time_in, units = "hours"))
dat$diff_out <- as.numeric(difftime(dat$time_out, dat$hour_out, units = "hours"))

hr_bucket <- data.frame(matrix(0, ncol = 24, nrow = nrow(dat)))
names(hr_bucket) <- paste0(1:24-1, "_", 1:24)

stg_data <- dat[, c("start_hr", "diff_time", "diff_in", "diff_out")]
stg_calc <- cbind(stg_data, hr_bucket)

col_index <- ncol(stg_data)
for (i in 1:nrow(stg_calc)) {

  ref_start_hr <- stg_calc[i ,]$start_hr 
  ref_diff_time <- stg_calc[i ,]$diff_time

  ref_diff_in <- stg_calc[i ,]$diff_in
  ref_diff_out <- stg_calc[i ,]$diff_out

  # if a person works till the next morning
  if ((ref_start_hr + ref_diff_time) > 24) {

    offset_col_used <- 24 - ref_start_hr + 1
    offset_col_rem <- ref_diff_time - offset_col_used

    stg_calc[i, (col_index + ref_start_hr):(col_index + 24)] <- 1
    stg_calc[i, (col_index + 1):(col_index + offset_col_rem)] <- 1

  } else {

    stg_calc[i, (col_index + ref_start_hr):(col_index + ref_start_hr + ref_diff_time - 1)] <- 1

  } 

  # To adjust for fraction of hour worked at start and end
  if (stg_calc[i, ]$diff_in %% 1 > 0) stg_calc[i, col_index + ref_start_hr] <- ref_diff_in
  if (stg_calc[i, ]$diff_out %% 1 > 0) stg_calc[i, col_index + ref_start_hr + ref_diff_time - 1] <- ref_diff_out

}

dat2 <- cbind(dat[, c("Person", "Date", "Time_In", "Time_Out")], stg_calc[, names(hr_bucket)])
dat2
           Person       Date Time_In Time_Out 0_1  1_2 2_3 3_4 4_5 5_6 6_7 7_8 8_9 9_10 10_11 11_12 12_13 13_14 14_15 15_16 16_17 17_18 18_19 19_20 20_21 21_22 22_23 23_24
1        John Doe 2019-01-01  1:15pm   2:30pm   0 0.00   0   0   0   0   0   0   0    0     0     0     0  0.75   0.5  0.00     0     0     0     0     0     0     0     0
2     Brian Smith 2019-01-02  1:45am   3:33pm   0 0.25   1   1   1   1   1   1   1    1     1     1     1  1.00   1.0  0.55     0     0     0     0     0     0     0     0
3        Jane Doe 2019-01-04 11:00pm   3:00am   1 1.00   1   0   0   0   0   0   0    0     0     0     0  0.00   0.0  0.00     0     0     0     0     0     0     0     1
4 Alexandra Wakes 2019-01-02  1:00am   1:00am   1 1.00   1   1   1   1   1   1   1    1     1     1     1  1.00   1.0  1.00     1     1     1     1     1     1     1     1
MKa
  • 2,248
  • 16
  • 22
5

This seems to work, though it has a few kludgy steps.

library(tidyverse)
library(lubridate)

(dat<-tibble(Date = c("1/1/19", "1/2/19", "1/4/19", "1/2/19"), 
             Person = c("John Doe", "Brian Smith", "Jane Doe", "Alexandra Wakes"), 
             Time_In = c("1:15pm", "1:45am", "11:00pm", "1:00am"), 
             Time_Out = c("2:30pm","3:33pm","3:00am","1:00am")))

dat2 <- dat %>%
  mutate(Time_In2 = mdy_hm(paste(Date, Time_In)),
         Time_Out2 = mdy_hm(paste(Date, Time_Out)),
         Time_Out2 = Time_Out2 + if_else(Time_Out2 <= Time_In2, ddays(1), 0)) %>%
  select(Person, Time_In2, Time_Out2) %>%
  gather(type, time, -Person) %>%

  # Kludge #1: gather seems to have converted POSIXct into numeric, switch back
  mutate(time = as.POSIXct(time, origin="1970-01-01", tz = "UTC")) %>%

  # Kludge #2: add rows for all minutes of day for each person.
  #   Clearly not most efficient method! This might be slowish if you have
  #   many thousands of Person values.
  group_by(Person) %>%
  padr::pad(interval = "min") %>%
  mutate(hour = hour(time)) %>%
  # Exclude ending minute to avoid double-counting
  filter(type != "Time_Out2" | is.na(type)) %>%
  ungroup() %>%

  count(Person, hour) %>%
  mutate(n = n/60) %>%
  spread(hour, n, fill = 0)

> dat2
# A tibble: 4 x 25
  Person            `0`   `1`   `2`   `3`   `4`   `5`   `6`   `7`   `8`   `9`  `10`  `11`  `12`  `13`  `14`  `15`  `16`  `17`  `18`  `19`  `20`  `21`  `22`  `23`
  <chr>           <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
1 Alexandra Wakes     1  1        1     1     1     1     1     1     1     1     1     1     1  1      1    1        1     1     1     1     1     1     1     1
2 Brian Smith         0  0.25     1     1     1     1     1     1     1     1     1     1     1  1      1    0.55     0     0     0     0     0     0     0     0
3 Jane Doe            1  1        1     0     0     0     0     0     0     0     0     0     0  0      0    0        0     0     0     0     0     0     0     1
4 John Doe            0  0        0     0     0     0     0     0     0     0     0     0     0  0.75   0.5  0        0     0     0     0     0     0     0     0
Jon Spring
  • 55,165
  • 4
  • 35
  • 53
2

I start with @Jon Spring's data frame where the strings are transformed into datetimes. I complete the data frame with all combinations of hours, but you likely could skip that with enough people.

library(tidyverse)
library(lubridate)

dat<-tibble(ID = 1:4,
            Date = c("1/1/19", "1/2/19", "1/4/19", "1/2/19"), 
             Person = c("John Doe", "Brian Smith", "Jane Doe", "Alexandra Wakes"), 
             Time_In = c("1:15pm", "1:45am", "11:00pm", "1:00am"), 
             Time_Out = c("2:30pm","3:33pm","3:00am","1:00am"))

# Stolen from Jon Spring
# https://stackoverflow.com/a/55698472/11355066
dat2<- dat%>%
  mutate(Shift_Start = mdy_hm(paste(Date, Time_In)),
         Shift_End = mdy_hm(paste(Date, Time_Out)),
         Shift_End = Shift_End + if_else(Shift_End <= Shift_Start, ddays(1), 0)) 

# Different solution
dat2%>%
  group_by(ID, Person, Shift_Start, Shift_End)%>%
  do(
    tibble(hours_worked = seq.POSIXt(from = floor_date(.$Shift_Start, 'hour'), to = ceiling_date(.$Shift_End - dhours(), 'hour'), by = 'hour')
           ,hours_values = na.omit(
             c(if_else(minute(.$Shift_Start) == 0, NA_integer_, as.integer(60 - minute(.$Shift_Start)))
               ,rep(60L, length(seq.POSIXt(from = ceiling_date(.$Shift_Start, 'hour'), to = floor_date(.$Shift_End, 'hour'), by = 'hour'))-1)
               ,if_else(minute(.$Shift_End) == 0, NA_integer_, as.integer(minute(.$Shift_End))))
             )/60
           )
    )%>%
  complete(hours_worked = seq.POSIXt(from = min(floor_date(Shift_Start, 'day')), to = max(ceiling_date(Shift_End, 'day')), by = 'hour'))%>%
  mutate(hour_minutes = format(hours_worked, "%H:%M"))%>%
  select(-hours_worked)%>%
  na.omit()%>%
  # ID only grouped to match order of the poster
  # group_by(ID, Person, hour_minutes)%>%summarize(hours_values = sum(hours_values))%>%
  spread(hour_minutes, hours_values, fill = 0)
Cole
  • 11,130
  • 1
  • 9
  • 24