Thanks to everyone who contributed. I went down the rabbit hole myself, only to notice that it's indeed a deep rabbit hole.
The first approach was based on my initial idea extended by a create_dt_patterns() function, that generates a wide combination of patterns. After this list exceeded 500 patterns, I tweaked the code a bit to produce only patterns that fit the separators in the string. I combined this with a counter to only return the most frequently used pattern. This helps with ambiguous results. If you're interested in my potentially ugly and incomplete code, here you go:
def get_dt_format(col: list, patterns: list, n: int = 100):
for val in col[:n]:
for pattern in patterns.keys():
try:
_ = datetime.strptime(val, pattern)
patterns[pattern] += 1
except:
pass
if sum(patterns.values()) == 0:
return False
return max(patterns, key=patterns.get)
def create_dt_patterns(dt_str: str):
dates = create_date_patterns(dt_str)
times = create_time_patterns(dt_str)
seps = create_dt_separators(dt_str)
dts = [d + s + t for d in dates for s in seps for t in times]
dts += dates + times
return {x: 0 for x in dts}
def create_dt_separators(dt_str: str):
seps = [x for x in ["T", " "] if x in dt_str]
return [""] if len(seps) == 0 else seps
def create_time_patterns(dt_str: str):
time_opts = ["%H{0}%M{0}%S.%f", "%H{0}%M{0}%S", "%H{0}%M"]
return [t.format(":" if ":" in dt_str else "") for t in time_opts]
def create_date_patterns(dt_str: str):
date_opts = [
"%Y{0}%m{0}%d",
"%m{0}%d{0}%Y",
"%d{0}%m{0}%Y",
"%y{0}%m{0}%d",
"%m{0}%d{0}%y",
"%d{0}%m{0}%y",
]
dates = [d.format(s) for d in date_opts for s in ["-", "/", "."] if s in dt_str]
if len(dates) == 0:
dates = [d.format("") for d in date_opts]
return dates
Then I took a shower because I felt dirty writing this code and also noticed that strptime
is not a very fast function. Doing this for n
entries using m
different patterns over k
columns takes some time.
The next stop was a RegEx approach, which is much quicker and made me feel a little better about myself. It's roughly 30x faster than trying different patterns.
Since I'm planning on testing and developing this logic over time, I created a GitHub repo for it. To also post it here, which I probably will not update:
DATE_RE = r"(?P<date>\d{2,4}[-/.]\d{2}[-/.]\d{2,4})?"
SEP_RE = r"(?P<sep>\s|T)?"
TIME_RE = r"(?P<time>\d{2}:\d{2}(:\d{2})?\s*([AP]M)?)?"
FULL_RE = DATE_RE + SEP_RE + TIME_RE
YMD_RE = r"^(?P<ay>(?:[12][0-9])?[0-9]{2})(?P<bs>[-/.])(?P<cm>0[1-9]|1[0-2])(?P<ds>[-/.])(?P<ed>0[1-9]|[12][0-9]|3[01])$"
DMY_RE = r"^(?P<ad>0[1-9]|[12][0-9]|3[01])(?P<bs>[-/.])(?P<cm>0[1-9]|1[0-2])(?P<ds>[-/.])(?P<ey>(?:[12][0-9])?[0-9]{2})$"
MDY_RE = r"^(?P<am>0[1-9]|1[0-2])(?P<bs>[-/.])(?P<cd>0[1-9]|[12][0-9]|3[01])(?P<ds>[-/.])(?P<ey>(?:[12][0-9])?[0-9]{2})$"
HMS_RE = r"^(?P<aH>\d{1,2})(?P<bs>:?)(?P<cM>\d{2})(?:(?P<ds>:?)(?P<eS>\d{2}))?(?:(?P<fs>\s)?(?P<ga>[AP]M))?$"
def guess_datetime_format(values: list[str], n=100, return_dict=False):
di = {}
for val in values[:n]:
if val is None:
continue
fmts = datetime_formats(val)
for fmt in fmts:
if fmt not in di:
di[fmt] = 1
di[fmt] += 1
if len(di) == 0:
return None
if return_dict:
return di
return max(di, key=di.get)
def datetime_formats(value: str) -> list:
assert "," not in value # TODO: handle these cases
m = re.match(FULL_RE, value)
dates = "" if m["date"] is None else date_formats(m["date"])
sep = "" if m["sep"] is None else m["sep"]
time = "" if m["time"] is None else time_format(m["time"])
return [date + sep + time for date in dates]
def date_formats(date_value: str) -> list:
matches = []
for p in [YMD_RE, MDY_RE, DMY_RE]:
m = re.match(p, date_value)
if m is None:
continue
fmt = ""
for c in sorted(m.groupdict().keys()):
if c[1] == "s": # separator character
fmt += "" if m[c] is None else m[c]
else: # year, month, day
fmt += "%" + c[1] if len(m[c]) == 2 else "%Y"
matches.append(fmt)
return matches
def time_format(time_value: str) -> str:
m = re.match(HMS_RE, time_value)
fmt = ""
for c in sorted(m.groupdict().keys()):
if c[1] == "s": # separator character
fmt += "" if m[c] is None else m[c]
else:
fmt += "" if m[c] is None else "%" + c[1]
if "M" in time_value: # AM or PM
fmt = fmt.replace("%H", "%I")
return fmt