pandas-dev · jreback · May 29, 2018 · Mar 18, 2018 · Feb 22, 2018 · Mar 14, 2018
diff --git a/doc/source/whatsnew/v0.23.1.txt b/doc/source/whatsnew/v0.23.1.txt
@@ -16,6 +16,7 @@ New features
 ~~~~~~~~~~~~
 
 - :meth:`Index.droplevel` is now implemented also for flat indexes, for compatibility with MultiIndex (:issue:`21115`)
+- :func:`to_datetime` now supports the ``%Z`` and ``%z`` directives when passed into ``format`` (:issue:`13486`)
 
 
 .. _whatsnew_0231.deprecations:

diff --git a/pandas/_libs/tslibs/strptime.pyx b/pandas/_libs/tslibs/strptime.pyx
@@ -20,6 +20,7 @@ except:
  except:
  from _dummy_thread import allocate_lock as _thread_allocate_lock
 
+import pytz
 
 from cython cimport Py_ssize_t
 from cpython cimport PyFloat_Check
@@ -40,6 +41,27 @@ from util cimport is_string_object
 from nattype cimport checknull_with_nat, NPY_NAT
 from nattype import nat_strings
 
+cdef dict _parse_code_table = {'y': 0,  # directive letter -> parse_code
+ 'Y': 1,
+ 'm': 2,
+ 'B': 3,
+ 'b': 4,
+ 'd': 5,
+ 'H': 6,
+ 'I': 7,
+ 'M': 8,
+ 'S': 9,
+ 'f': 10,
+ 'A': 11,
+ 'a': 12,
+ 'w': 13,
+ 'j': 14,
+ 'U': 15,
+ 'W': 16,
+ 'Z': 17,  # timezone name, resolved through pytz.timezone
+ 'p': 18, # AM/PM marker: an additional key, only meaningful with 'I'
+ 'z': 19}  # UTC offset, handled by parse_timezone_directive
+
 
 def array_strptime(ndarray[object] values, object fmt,
  bint exact=True, errors='raise'):
@@ -58,15 +80,15 @@ def array_strptime(ndarray[object] values, object fmt,
  Py_ssize_t i, n = len(values)
  pandas_datetimestruct dts
  ndarray[int64_t] iresult
- int year, month, day, minute, hour, second, weekday, julian, tz
- int week_of_year, week_of_year_start
+ ndarray[object] result_timezone
+ int year, month, day, minute, hour, second, weekday, julian
+ int week_of_year, week_of_year_start, parse_code, ordinal
  int64_t us, ns
- object val, group_key, ampm, found
+ object val, group_key, ampm, found, timezone
  dict found_key
  bint is_raise = errors=='raise'
  bint is_ignore = errors=='ignore'
  bint is_coerce = errors=='coerce'
- int ordinal
 
  assert is_raise or is_ignore or is_coerce
 
@@ -79,6 +101,8 @@ def array_strptime(ndarray[object] values, object fmt,
  in fmt):
  raise ValueError("Cannot use '%W' or '%U' without "
  "day and year")
+ elif '%Z' in fmt and '%z' in fmt:
+ raise ValueError("Cannot parse both %Z and %z")
 
  global _TimeRE_cache, _regex_cache
  with _cache_lock:
@@ -108,32 +132,10 @@ def array_strptime(ndarray[object] values, object fmt,
 
  result = np.empty(n, dtype='M8[ns]')
  iresult = result.view('i8')
+ result_timezone = np.empty(n, dtype='object')
 
  dts.us = dts.ps = dts.as = 0
 
- cdef dict _parse_code_table = {
- 'y': 0,
- 'Y': 1,
- 'm': 2,
- 'B': 3,
- 'b': 4,
- 'd': 5,
- 'H': 6,
- 'I': 7,
- 'M': 8,
- 'S': 9,
- 'f': 10,
- 'A': 11,
- 'a': 12,
- 'w': 13,
- 'j': 14,
- 'U': 15,
- 'W': 16,
- 'Z': 17,
- 'p': 18 # just an additional key, works only with I
- }
- cdef int parse_code
-
  for i in range(n):
  val = values[i]
  if is_string_object(val):
@@ -176,7 +178,7 @@ def array_strptime(ndarray[object] values, object fmt,
  year = 1900
  month = day = 1
  hour = minute = second = ns = us = 0
- tz = -1
+ timezone = None
  # Default to -1 to signify that values not known; not critical to have,
  # though
  week_of_year = -1
@@ -266,21 +268,10 @@ def array_strptime(ndarray[object] values, object fmt,
  # W starts week on Monday.
  week_of_year_start = 0
  elif parse_code == 17:
- # Since -1 is default value only need to worry about setting tz
- # if it can be something other than -1.
- found_zone = found_dict['Z'].lower()
- for value, tz_values in enumerate(locale_time.timezone):
- if found_zone in tz_values:
- # Deal w/ bad locale setup where timezone names are the
- # same and yet time.daylight is true; too ambiguous to
- # be able to tell what timezone has daylight savings
- if (time.tzname[0] == time.tzname[1] and
- time.daylight and found_zone not in (
- "utc", "gmt")):
- break
- else:
- tz = value
- break
+ timezone = pytz.timezone(found_dict['Z'])
+ elif parse_code == 19:
+ timezone = parse_timezone_directive(found_dict['z'])
+
  # If we know the wk of the year and what day of that wk, we can figure
  # out the Julian day of the year.
  if julian == -1 and week_of_year != -1 and weekday != -1:
@@ -330,7 +321,9 @@ def array_strptime(ndarray[object] values, object fmt,
  continue
  raise
 
- return result
+ result_timezone[i] = timezone
+
+ return result, result_timezone
 
 
 """_getlang, LocaleTime, TimeRE, _calc_julian_from_U_or_W are vendored
@@ -538,14 +531,13 @@ class TimeRE(dict):
  # XXX: Does 'Y' need to worry about having less or more than
  # 4 digits?
  'Y': r"(?P<Y>\d\d\d\d)",
+ 'z': r"(?P<z>[+-]\d\d:?[0-5]\d(:?[0-5]\d(\.\d{1,6})?)?|Z)",
  'A': self.__seqToRE(self.locale_time.f_weekday, 'A'),
  'a': self.__seqToRE(self.locale_time.a_weekday, 'a'),
  'B': self.__seqToRE(self.locale_time.f_month[1:], 'B'),
  'b': self.__seqToRE(self.locale_time.a_month[1:], 'b'),
  'p': self.__seqToRE(self.locale_time.am_pm, 'p'),
- 'Z': self.__seqToRE([tz for tz_names in self.locale_time.timezone
- for tz in tz_names],
- 'Z'),
+ 'Z': self.__seqToRE(pytz.all_timezones, 'Z'),
  '%': '%'})
  base.__setitem__('W', base.__getitem__('U').replace('U', 'W'))
  base.__setitem__('c', self.pattern(self.locale_time.LC_date_time))
@@ -632,3 +624,50 @@ cdef _calc_julian_from_U_or_W(int year, int week_of_year,
  else:
  days_to_week = week_0_length + (7 * (week_of_year - 1))
  return 1 + days_to_week + day_of_week
+
+cdef parse_timezone_directive(object z):
+    """
+    Parse the '%z' directive and return a pytz.FixedOffset
+
+    Parameters
+    ----------
+    z : string of the UTC offset
+
+    Returns
+    -------
+    pytz.FixedOffset
+
+    Raises
+    ------
+    ValueError
+        If ':' separates hours and minutes but not minutes and seconds
+
+    Notes
+    -----
+    This is essentially similar to the cpython implementation
+    https://github.com/python/cpython/blob/master/Lib/_strptime.py#L457-L479
+    """
+    cdef:
+        int hours, minutes, seconds, microseconds, total_minutes
+        object raw_offset = z
+
+    if z == 'Z':
+        return pytz.FixedOffset(0)
+    if z[3] == ':':
+        z = z[:3] + z[4:]
+        if len(z) > 5:
+            if z[5] != ':':
+                msg = "Inconsistent use of : in {0}"
+                raise ValueError(msg.format(raw_offset))
+            z = z[:5] + z[6:]
+    hours = int(z[1:3])
+    minutes = int(z[3:5])
+    seconds = int(z[5:7] or 0)
+
+    # Right-pad the fractional part to six digits, i.e. microseconds.
+    microseconds = int(z[8:].ljust(6, "0"))
+    # Explicit floor division: FixedOffset is built from whole minutes.
+    total_minutes = ((hours * 60) + minutes + (seconds // 60) +
+                     (microseconds // 60000000))
+    total_minutes = -total_minutes if z.startswith("-") else total_minutes
+    return pytz.FixedOffset(total_minutes)
diff --git a/pandas/core/tools/datetimes.py b/pandas/core/tools/datetimes.py
@@ -1,7 +1,8 @@
 from datetime import datetime, timedelta, time
-import numpy as np
 from collections import MutableMapping
 
+import numpy as np
+
 from pandas._libs import tslib
 from pandas._libs.tslibs.strptime import array_strptime
 from pandas._libs.tslibs import parsing, conversion
@@ -27,6 +28,7 @@
  ABCDataFrame)
 from pandas.core.dtypes.missing import notna
 from pandas.core import algorithms
+from pandas.compat import zip
 
 
 def _guess_datetime_format_for_array(arr, **kwargs):
@@ -103,6 +105,41 @@ def _convert_and_box_cache(arg, cache_array, box, errors, name=None):
  return result.values
 
 
+def _return_parsed_timezone_results(result, timezones, box, tz):
+    """
+    Return results from array_strptime if a %z or %Z directive was passed.
+
+    Parameters
+    ----------
+    result : ndarray
+        datetime64[ns] values parsed by array_strptime
+    timezones : ndarray
+        pytz timezone objects
+    box : boolean
+        True boxes result as an Index-like, False returns an ndarray
+    tz : object
+        None or pytz timezone object
+
+    Returns
+    -------
+    tz_result : Index-like if box=True, ndarray of Timestamps if box=False
+
+    Raises
+    ------
+    ValueError
+        If ``tz`` is given, as it would conflict with the parsed timezones
+    """
+    if tz is not None:
+        raise ValueError("Cannot pass a tz argument when parsing strings "
+                         "with timezone information.")
+    tz_results = np.array([tslib.Timestamp(res).tz_localize(zone)
+                           for res, zone in zip(result, timezones)])
+    if box:
+        from pandas import Index
+        return Index(tz_results)
+    return tz_results
+
+
 def to_datetime(arg, errors='raise', dayfirst=False, yearfirst=False,
  utc=None, box=True, format=None, exact=True,
  unit=None, infer_datetime_format=False, origin='unix',
@@ -343,8 +380,11 @@ def _convert_listlike(arg, box, format, name=None, tz=tz):
  # fallback
  if result is None:
  try:
- result = array_strptime(arg, format, exact=exact,
- errors=errors)
+ result, timezones = array_strptime(
+ arg, format, exact=exact, errors=errors)
+ if '%Z' in format or '%z' in format:
+ return _return_parsed_timezone_results(
+ result, timezones, box, tz)
  except tslib.OutOfBoundsDatetime:
  if errors == 'raise':
  raise

diff --git a/pandas/tests/indexes/datetimes/test_tools.py b/pandas/tests/indexes/datetimes/test_tools.py
@@ -179,6 +179,55 @@ def test_to_datetime_format_weeks(self, cache):
  for s, format, dt in data:
  assert to_datetime(s, format=format, cache=cache) == dt
 
+ @pytest.mark.parametrize("box,const,assert_equal", [
+ [True, pd.Index, 'assert_index_equal'],
+ [False, np.array, 'assert_numpy_array_equal']])
+ @pytest.mark.parametrize("fmt,dates,expected_dates", [
+ ['%Y-%m-%d %H:%M:%S %Z',
+ ['2010-01-01 12:00:00 UTC'] * 2,
+ [pd.Timestamp('2010-01-01 12:00:00', tz='UTC')] * 2],
+ ['%Y-%m-%d %H:%M:%S %Z',
+ ['2010-01-01 12:00:00 UTC',
+ '2010-01-01 12:00:00 GMT',
+ '2010-01-01 12:00:00 US/Pacific'],
+ [pd.Timestamp('2010-01-01 12:00:00', tz='UTC'),
+ pd.Timestamp('2010-01-01 12:00:00', tz='GMT'),
+ pd.Timestamp('2010-01-01 12:00:00', tz='US/Pacific')]],
+ ['%Y-%m-%d %H:%M:%S %z',
+ ['2010-01-01 12:00:00 +0100'] * 2,
+ [pd.Timestamp('2010-01-01 12:00:00',
+ tzinfo=pytz.FixedOffset(60))] * 2],
+ ['%Y-%m-%d %H:%M:%S %z',
+ ['2010-01-01 12:00:00 +0100', '2010-01-01 12:00:00 -0100'],
+ [pd.Timestamp('2010-01-01 12:00:00',
+ tzinfo=pytz.FixedOffset(60)),
+ pd.Timestamp('2010-01-01 12:00:00',
+ tzinfo=pytz.FixedOffset(-60))]],
+ ['%Y-%m-%d %H:%M:%S %z',
+ ['2010-01-01 12:00:00 Z', '2010-01-01 12:00:00 Z'],
+ [pd.Timestamp('2010-01-01 12:00:00',
+ tzinfo=pytz.FixedOffset(0)),
+ pd.Timestamp('2010-01-01 12:00:00',
+ tzinfo=pytz.FixedOffset(0))]]])
+ def test_to_datetime_parse_tzname_or_tzoffset(self, box, const,
+ assert_equal, fmt,
+ dates, expected_dates):
+ # GH 13486: %Z (tz name) and %z (UTC offset) give tz-aware results
+ result = pd.to_datetime(dates, format=fmt, box=box)
+ expected = const(expected_dates)
+ getattr(tm, assert_equal)(result, expected)
+
+ with pytest.raises(ValueError):  # utc=True is rejected with parsed tz
+ pd.to_datetime(dates, format=fmt, box=box, utc=True)
+
+ @pytest.mark.parametrize('offset', [
+ '+0', '-1foo', 'UTCbar', ':10', '+01:000:01'])
+ def test_to_datetime_parse_timezone_malformed(self, offset):
+ fmt = '%Y-%m-%d %H:%M:%S %z'
+ date = '2010-01-01 12:00:00 ' + offset
+ with pytest.raises(ValueError):  # each malformed offset must be rejected
+ pd.to_datetime([date], format=fmt)
+
 
 class TestToDatetime(object):
  def test_to_datetime_pydatetime(self):