to_datetime poor performance parsing string datetimes · Issue #1571 · pandas-dev/pandas (original) (raw)

Hi,

I want to convert to datetime64 a Series that contains datetimes as strings. The format is '%Y-%m-%d %H:%M:%S' ('2012-07-06 10:05:58', for instance).

Casting the string array to a datetime64 array in NumPy (or using Series.astype("datetime64")) is fast, but it shifts the datetimes according to the local timezone, which is not the behavior I want in this case. The pandas to_datetime function parses the strings correctly, but it is much slower.

However, it is also possible to parse the strings both correctly and quickly with NumPy by appending the "+0000" timezone suffix to every string before casting to datetime64. So I wonder: is there any reason why to_datetime() runs so much slower than this approach?

Thanks and regards.

Some sample code to illustrate the issue:

import os import time import numpy as np from datetime import datetime, timedelta from pandas import DatetimeIndex, Series, to_datetime from pandas.tseries.offsets import Minute

DATETIME_FORMAT = "%Y-%m-%d %H:%M:%S"

def GMT_Timezone(func):
    """Decorator to execute code in a faked GMT timezone.

    While ``func`` runs, the ``TZ`` environment variable is ``'GMT'`` and
    ``time.tzset()`` has been called so the change takes effect. Afterwards
    ``TZ`` is put back exactly as it was found (removed again if it was not
    set), even when ``func`` raises.

    Note: this changes process-wide state (``os.environ`` and the timezone
    applied by ``time.tzset()``) for the duration of the call.
    """
    import functools  # local import so the block does not rely on new file-level imports

    @functools.wraps(func)
    def wrapper(*args, **kwargs):
        # Save the raw TZ setting (None means "not set"). Restoring from
        # time.tzname[0] would write a bare abbreviation into TZ, which is
        # not the value that was there before.
        previous_tz = os.environ.get('TZ')
        os.environ['TZ'] = 'GMT'
        time.tzset()
        try:
            return func(*args, **kwargs)
        finally:
            # Always undo the override, including on exceptions.
            if previous_tz is None:
                os.environ.pop('TZ', None)
            else:
                os.environ['TZ'] = previous_tz
            time.tzset()
    return wrapper

def generate_test_data():
    """Build the benchmark input: a Series of datetime strings.

    Returns a Series with one string per minute from 2012-01-01 00:00 up to
    2012-01-31 00:00, each formatted with ``DATETIME_FORMAT``.
    """
    # date_range replaces the DatetimeIndex(start=..., end=..., freq=...)
    # constructor form, which newer pandas releases no longer accept.
    from pandas import date_range

    start_date = datetime(2012, 1, 1)
    end_date = datetime(2012, 1, 31)
    minutes = date_range(start=start_date, end=end_date, freq=Minute())
    str_minutes = Series(minutes.map(lambda d: d.strftime(DATETIME_FORMAT)))
    return str_minutes

def simple_casting(data):
    """Cast ``data`` directly to ``datetime64`` through its ``astype`` method."""
    target_dtype = "datetime64"
    return data.astype(target_dtype)

def shifting_afterwards(data):
    """Cast ``data`` to ``datetime64``, then shift every value by a fixed offset.

    The same ``time.altzone`` seconds are subtracted from every entry,
    whatever its date, and the shifted values are returned as a Series.
    """
    correction = timedelta(seconds=time.altzone)
    as_index = DatetimeIndex(data.astype("datetime64"))
    return Series(as_index - correction)

def concat_gmt_tz(data):
    """Append the "+0000" suffix to ``data`` and cast the result to ``datetime64``.

    The suffix is added with ``np.add``; the cast is done by calling
    ``astype("datetime64")`` on whatever ``np.add`` returns.
    """
    with_utc_suffix = np.add(data, "+0000")
    return with_utc_suffix.astype("datetime64")

def using_to_datetime(data):
    """Parse ``data`` with pandas' ``to_datetime`` and return the result."""
    parsed = to_datetime(data)
    return parsed

@GMT_Timezone
def faking_tz(data):
    """Cast ``data`` to ``datetime64`` while running under ``GMT_Timezone``."""
    converted = data.astype("datetime64")
    return converted

# Script entry point: check which parsing approaches agree with
# to_datetime(), then time to_datetime() against the two that do.
if __name__ == '__main__':
    from timeit import Timer

    test_data = generate_test_data()

    # Some approaches to parse the datetime strings
    using_to_dt = using_to_datetime(test_data)
    simple = simple_casting(test_data)
    shifted = shifting_afterwards(test_data)
    faked_tz = faking_tz(test_data)
    concat_tz = concat_gmt_tz(test_data)

    # Simple casting does not work. During the process, numpy converts the
    # date into GMT according to the local timezone.
    assert not (simple == using_to_dt).all(), "Fails if run from GMT timezone."
    # Shifting after casting does not work either, the problem being that
    # we should shift using a different timezone depending on the date we are
    # parsing (summer vs winter) -> more complex + error-prone + slower
    assert not (shifted == using_to_dt).all()
    # Faking the timezone to be GMT avoids datetime transformations, so it
    # seems to work, although since we are messing with environment variables
    # it gets risky (multithreading, other side-effects?)
    assert (faked_tz == using_to_dt).all()
    assert (concat_tz == using_to_dt).all()

    # Now to performance
    ITERATIONS = 5

    def _mean_runtime(func_name):
        """Return the mean time in seconds of one ``func_name(test_data)`` call.

        The data is regenerated in the Timer setup, so generating it is not
        part of the measured time.
        """
        timer = Timer(
            "%s(test_data)" % func_name,
            "from __main__ import generate_test_data, %s;"
            "test_data = generate_test_data()" % func_name)
        return timer.timeit(ITERATIONS) / ITERATIONS

    t_using_to_dt = _mean_runtime("using_to_datetime")
    t_faking_tz = _mean_runtime("faking_tz")
    t_concat_tz = _mean_runtime("concat_gmt_tz")

    # Single-argument print(...) produces the same output on Python 2 and 3.
    print("to_datetime(): %s" % t_using_to_dt)
    print("faking tz: %s" % t_faking_tz)
    # to_datetime() ~90 times slower than .astype("datetime64")
    print("Ratio: %s" % (t_using_to_dt / t_faking_tz))

    print("")

    print("to_datetime(): %s" % t_using_to_dt)
    print("concat tz: %s" % t_concat_tz)
    # to_datetime() ~270 times slower than concat tz + .astype("datetime64")
    print("Ratio: %s" % (t_using_to_dt / t_concat_tz))