7

I have two columns of POSIXlt times with no NA values, yet NA values show up when I check:

> sum(is.na(check$start))
[1] 19 
> sum(is.na(check$end))
[1] 23

The data is present in the cells, so why does this happen? I have heard that this can happen with POSIXlt, but even when I convert the column to POSIXct the behavior is very strange. How does one go about solving this?

> as.POSIXct(check$start, format = "%Y-%m-%d %H:%M:%S", tz = "CST6CDT")
 [1] NA                        "2014-03-09 01:35:01 CST" NA                        "2014-03-09 01:53:30 CST" NA                       
 [6] NA                        NA                        NA                        NA                        "2014-03-09 04:17:11 CDT"
[11] NA                        NA                        "2015-03-08 01:54:43 CST" NA                        NA                       
[16] NA                        NA                        NA                        NA                        NA                       
[21] NA                        NA                        NA  


> dput(check)
structure(list(start = structure(list(sec = c(24, 1, 27, 30, 
8, 21, 40, 9, 43, 11, 31, 43, 43, 55, 39, 54, 41, 19, 2, 35, 
6, 54, 40), min = c(45L, 35L, 14L, 53L, 36L, 37L, 47L, 48L, 54L, 
17L, 57L, 53L, 54L, 3L, 52L, 22L, 34L, 28L, 41L, 42L, 52L, 52L, 
53L), hour = c(2L, 1L, 2L, 1L, 2L, 2L, 2L, 2L, 2L, 4L, 2L, 2L, 
1L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L), mday = c(9L, 9L, 
9L, 9L, 9L, 9L, 9L, 9L, 9L, 9L, 9L, 8L, 8L, 8L, 8L, 8L, 8L, 8L, 
8L, 8L, 8L, 8L, 8L), mon = c(2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 
2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L), 
    year = c(114L, 114L, 114L, 114L, 114L, 114L, 114L, 114L, 
    114L, 114L, 114L, 115L, 115L, 115L, 115L, 115L, 115L, 115L, 
    115L, 115L, 115L, 115L, 115L), wday = c(0L, 0L, 0L, 0L, 0L, 
    0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 
    0L, 0L, 0L), yday = c(67L, 67L, 67L, 67L, 67L, 67L, 67L, 
    67L, 67L, 67L, 67L, 66L, 66L, 66L, 66L, 66L, 66L, 66L, 66L, 
    66L, 66L, 66L, 66L), isdst = c(-1L, 0L, -1L, 0L, -1L, -1L, 
    -1L, -1L, -1L, 1L, -1L, -1L, 0L, -1L, -1L, -1L, -1L, -1L, 
    -1L, -1L, -1L, -1L, -1L), zone = c("", "CST", "", "CST", 
    "", "", "", "", "", "CDT", "", "", "CST", "", "", "", "", 
    "", "", "", "", "", ""), gmtoff = c(NA_integer_, NA_integer_, 
    NA_integer_, NA_integer_, NA_integer_, NA_integer_, NA_integer_, 
    NA_integer_, NA_integer_, NA_integer_, NA_integer_, NA_integer_, 
    NA_integer_, NA_integer_, NA_integer_, NA_integer_, NA_integer_, 
    NA_integer_, NA_integer_, NA_integer_, NA_integer_, NA_integer_, 
    NA_integer_)), .Names = c("sec", "min", "hour", "mday", "mon", 
"year", "wday", "yday", "isdst", "zone", "gmtoff"), class = c("POSIXlt", 
"POSIXt"), tzone = c("CST6CDT", "CST", "CDT")), end = structure(list(
    sec = c(7, 59, 38, 45, 29, 46, 39, 14, 52, 29, 37, 5, 23, 
    41, 10, 43, 46, 46, 53, 24, 57, 13, 51), min = c(55L, 47L, 
    30L, 2L, 43L, 51L, 53L, 56L, 54L, 54L, 57L, 56L, 6L, 3L, 
    13L, 29L, 37L, 32L, 48L, 47L, 55L, 55L, 55L), hour = c(2L, 
    2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 
    2L, 2L, 2L, 2L, 2L, 2L, 2L), mday = c(9L, 9L, 9L, 9L, 9L, 
    9L, 9L, 9L, 9L, 9L, 9L, 8L, 8L, 8L, 8L, 8L, 8L, 8L, 8L, 8L, 
    8L, 8L, 8L), mon = c(2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 
    2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L), 
    year = c(114L, 114L, 114L, 114L, 114L, 114L, 114L, 114L, 
    114L, 114L, 114L, 115L, 115L, 115L, 115L, 115L, 115L, 115L, 
    115L, 115L, 115L, 115L, 115L), wday = c(0L, 0L, 0L, 0L, 0L, 
    0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 
    0L, 0L, 0L), yday = c(67L, 67L, 67L, 67L, 67L, 67L, 67L, 
    67L, 67L, 67L, 67L, 66L, 66L, 66L, 66L, 66L, 66L, 66L, 66L, 
    66L, 66L, 66L, 66L), isdst = c(-1L, -1L, -1L, -1L, -1L, -1L, 
    -1L, -1L, -1L, -1L, -1L, -1L, -1L, -1L, -1L, -1L, -1L, -1L, 
    -1L, -1L, -1L, -1L, -1L), zone = c("", "", "", "", "", "", 
    "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", 
    "", ""), gmtoff = c(NA_integer_, NA_integer_, NA_integer_, 
    NA_integer_, NA_integer_, NA_integer_, NA_integer_, NA_integer_, 
    NA_integer_, NA_integer_, NA_integer_, NA_integer_, NA_integer_, 
    NA_integer_, NA_integer_, NA_integer_, NA_integer_, NA_integer_, 
    NA_integer_, NA_integer_, NA_integer_, NA_integer_, NA_integer_
    )), .Names = c("sec", "min", "hour", "mday", "mon", "year", 
"wday", "yday", "isdst", "zone", "gmtoff"), class = c("POSIXlt", 
"POSIXt"), tzone = c("CST6CDT", "CST", "CDT"))), .Names = c("start", 
"end"), row.names = c(1559963L, 1560092L, 1560157L, 1560220L, 
1560240L, 1560247L, 1560252L, 1560253L, 1560255L, 1560258L, 1560260L, 
2004432L, 2004583L, 2004591L, 2004594L, 2004596L, 2004598L, 2004599L, 
2004600L, 2004603L, 2004609L, 2004610L, 2004611L), class = "data.frame")
iskandarblue
  • 7,208
  • 15
  • 60
  • 130
  • This is similar to another question from a couple of days ago: http://stackoverflow.com/questions/36648502/modified-date-inside-data-frame-becomes-na-after-selection. There was no resolution there either. – Dave2e Apr 18 '16 at 14:57
  • Everything seems to work for me with R version 3.2.5 on ubuntu. – Richard Telford Apr 18 '16 at 15:11
  • Could you show the data using `dput` ? – iskandarblue Apr 18 '16 at 15:16
  • as.POSIXct(check$start, format = "%Y-%m-%d %H:%M:%S", tz = "GMT") works! There seems to be a problem with other timezones or times lacking timezones. – Dave2e Apr 18 '16 at 15:40

1 Answer

8

How does is.na work in this context?

> is.na.POSIXlt
function (x) 
is.na(as.POSIXct(x))
<bytecode: 0x0000000014232980>

How does as.POSIXct behave here ?

> as.POSIXct(check$start)
 [1] NA                        "2014-03-09 01:35:01 CST" NA                        "2014-03-09 01:53:30 CST"
 [5] NA                        NA                        NA                        NA                       
 [9] NA                        "2014-03-09 04:17:11 CDT" NA                        NA                       
[13] "2015-03-08 01:54:43 CST" NA                        NA                        NA                       
[17] NA                        NA                        NA                        NA                       
[21] NA                        NA                        NA                       

Ok, but WHY ????

Let's check the doc of as.POSIXct:

Any conversion that needs to go between the two date-time classes requires a time zone: conversion from "POSIXlt" to "POSIXct" will validate times in the selected time zone. One issue is what happens at transitions to and from DST, for example in the UK

Let's see:

> check$start$zone
 [1] ""    "CST" ""    "CST" ""    ""    ""    ""    ""    "CDT" ""    ""    "CST" ""    ""    ""    ""    ""    ""    ""   
[21] ""    ""    ""   

And here be dragons: there is no time zone except for 4 entries, so as.POSIXct can't tell whether the dates are valid (within a DST change or not?), as you can see with:

> check$start$isdst
 [1] -1  0 -1  0 -1 -1 -1 -1 -1  1 -1 -1  0 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1

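So the conversion from POSIXlt (your data frame) to POSIXct can't determine whether the date is valid, and returns NA. Note that every affected entry has hour 2 on 2014-03-09 or 2015-03-08, the days on which CST6CDT switches to daylight saving time: local times between 02:00 and 03:00 do not exist on those days, which is why isdst is -1 and the zone is empty for them.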

One way to fix this is to enforce a time zone on all records:

> check$start <- as.POSIXlt(strftime(check$start,tz="CST"),tz="CST6CDT")
> is.na(check$start)
 [1] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
Tensibai
  • 15,557
  • 1
  • 37
  • 57