-
- Notifications
You must be signed in to change notification settings - Fork 19.4k
ENH: Parse %z and %Z directive in format for to_datetime #19979
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from 27 commits
4a43815 cb47c08 f299aec 259ec8f 77af4db 54c2491 0e2a0cd d31e141 7bdbdf4 3e3d5c6 c16ef8c 6f0b7f0 0525823 4c22808 24e1c0a 4f2f865 145e5da 64bc3fc 47a9d69 1b44554 0dcc59f 149781b d99ef5a 0e5e3c6 9a2ea19 924859e a1599a0 6c80c2e abccc3e 473a0f4 ab0a692 56fc683 85bd45e eb2a661 5500ca8 0e0d0fd 34f638c 757458d File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| | @@ -20,6 +20,7 @@ except: | |
| except: | ||
| from _dummy_thread import allocate_lock as _thread_allocate_lock | ||
| | ||
| import pytz | ||
| | ||
| from cython cimport Py_ssize_t | ||
| from cpython cimport PyFloat_Check | ||
| | @@ -58,15 +59,36 @@ def array_strptime(ndarray[object] values, object fmt, | |
| Py_ssize_t i, n = len(values) | ||
| pandas_datetimestruct dts | ||
| ndarray[int64_t] iresult | ||
| int year, month, day, minute, hour, second, weekday, julian, tz | ||
| int week_of_year, week_of_year_start | ||
| ndarray[object] result_timezone | ||
| int year, month, day, minute, hour, second, weekday, julian | ||
| int week_of_year, week_of_year_start, parse_code | ||
| int64_t us, ns | ||
| object val, group_key, ampm, found | ||
| object val, group_key, ampm, found, timezone | ||
| dict found_key | ||
| bint is_raise = errors=='raise' | ||
| bint is_ignore = errors=='ignore' | ||
| bint is_coerce = errors=='coerce' | ||
| int ordinal | ||
| dict _parse_code_table = {'y': 0, | ||
| 'Y': 1, | ||
| 'm': 2, | ||
| 'B': 3, | ||
| 'b': 4, | ||
| 'd': 5, | ||
| 'H': 6, | ||
| 'I': 7, | ||
| 'M': 8, | ||
| 'S': 9, | ||
| 'f': 10, | ||
| 'A': 11, | ||
| 'a': 12, | ||
| 'w': 13, | ||
| 'j': 14, | ||
| 'U': 15, | ||
| 'W': 16, | ||
| 'Z': 17, | ||
| 'p': 18, # an additional key, only with I | ||
| 'z': 19} | ||
| | ||
| assert is_raise or is_ignore or is_coerce | ||
| | ||
| | @@ -108,32 +130,10 @@ def array_strptime(ndarray[object] values, object fmt, | |
| | ||
| result = np.empty(n, dtype='M8[ns]') | ||
| iresult = result.view('i8') | ||
| result_timezone = np.empty(n, dtype='object') | ||
| | ||
| dts.us = dts.ps = dts.as = 0 | ||
| | ||
| cdef dict _parse_code_table = { | ||
| 'y': 0, | ||
| 'Y': 1, | ||
| 'm': 2, | ||
| 'B': 3, | ||
| 'b': 4, | ||
| 'd': 5, | ||
| 'H': 6, | ||
| 'I': 7, | ||
| 'M': 8, | ||
| 'S': 9, | ||
| 'f': 10, | ||
| 'A': 11, | ||
| 'a': 12, | ||
| 'w': 13, | ||
| 'j': 14, | ||
| 'U': 15, | ||
| 'W': 16, | ||
| 'Z': 17, | ||
| 'p': 18 # just an additional key, works only with I | ||
| } | ||
| cdef int parse_code | ||
| | ||
| for i in range(n): | ||
| val = values[i] | ||
| if is_string_object(val): | ||
| | @@ -176,7 +176,7 @@ def array_strptime(ndarray[object] values, object fmt, | |
| year = 1900 | ||
| month = day = 1 | ||
| hour = minute = second = ns = us = 0 | ||
| tz = -1 | ||
| timezone = None | ||
| # Default to -1 to signify that values not known; not critical to have, | ||
| # though | ||
| week_of_year = -1 | ||
| | @@ -266,21 +266,10 @@ def array_strptime(ndarray[object] values, object fmt, | |
| # W starts week on Monday. | ||
| week_of_year_start = 0 | ||
| elif parse_code == 17: | ||
| # Since -1 is default value only need to worry about setting tz | ||
| # if it can be something other than -1. | ||
| found_zone = found_dict['Z'].lower() | ||
| for value, tz_values in enumerate(locale_time.timezone): | ||
| if found_zone in tz_values: | ||
| # Deal w/ bad locale setup where timezone names are the | ||
| # same and yet time.daylight is true; too ambiguous to | ||
| # be able to tell what timezone has daylight savings | ||
| if (time.tzname[0] == time.tzname[1] and | ||
| time.daylight and found_zone not in ( | ||
| "utc", "gmt")): | ||
| break | ||
| else: | ||
| tz = value | ||
| break | ||
| timezone = pytz.timezone(found_dict['Z']) | ||
| elif parse_code == 19: | ||
| Contributor There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. can you move this whole parse to a function and just call it here (and return the values as a tuple) | ||
| timezone = parse_timezone_directive(found_dict['z']) | ||
| | ||
| # If we know the wk of the year and what day of that wk, we can figure | ||
| # out the Julian day of the year. | ||
| if julian == -1 and week_of_year != -1 and weekday != -1: | ||
| | @@ -330,7 +319,9 @@ def array_strptime(ndarray[object] values, object fmt, | |
| continue | ||
| raise | ||
| | ||
| return result | ||
| result_timezone[i] = timezone | ||
| | ||
| return result, result_timezone | ||
| | ||
| | ||
| """_getlang, LocaleTime, TimeRE, _calc_julian_from_U_or_W are vendored | ||
| | @@ -538,14 +529,13 @@ class TimeRE(dict): | |
| # XXX: Does 'Y' need to worry about having less or more than | ||
| # 4 digits? | ||
| 'Y': r"(?P<Y>\d\d\d\d)", | ||
| 'z': r"(?P<z>[+-]\d\d:?[0-5]\d(:?[0-5]\d(\.\d{1,6})?)?|Z)", | ||
| 'A': self.__seqToRE(self.locale_time.f_weekday, 'A'), | ||
| 'a': self.__seqToRE(self.locale_time.a_weekday, 'a'), | ||
| 'B': self.__seqToRE(self.locale_time.f_month[1:], 'B'), | ||
| 'b': self.__seqToRE(self.locale_time.a_month[1:], 'b'), | ||
| 'p': self.__seqToRE(self.locale_time.am_pm, 'p'), | ||
| 'Z': self.__seqToRE([tz for tz_names in self.locale_time.timezone | ||
| for tz in tz_names], | ||
| 'Z'), | ||
| 'Z': self.__seqToRE(pytz.all_timezones, 'Z'), | ||
| '%': '%'}) | ||
| base.__setitem__('W', base.__getitem__('U').replace('U', 'W')) | ||
| base.__setitem__('c', self.pattern(self.locale_time.LC_date_time)) | ||
| | @@ -632,3 +622,50 @@ cdef _calc_julian_from_U_or_W(int year, int week_of_year, | |
| else: | ||
| days_to_week = week_0_length + (7 * (week_of_year - 1)) | ||
| return 1 + days_to_week + day_of_week | ||
| | ||
cdef parse_timezone_directive(object z):
    """
    Parse the '%z' directive and return a pytz.FixedOffset

    Parameters
    ----------
    z : string of the UTC offset
        Either the literal 'Z' or a signed offset such as '+0100',
        '+01:00', '+01:00:30' or '+01:00:30.123456'.

    Returns
    -------
    pytz.FixedOffset
        Built from a whole number of minutes; 'Z' maps to an offset of 0.

    Raises
    ------
    ValueError
        If a colon separates hours from minutes but not minutes from
        seconds (e.g. '+01:0030').

    Notes
    -----
    This is essentially similar to the cpython implementation
    https://github.com/python/cpython/blob/master/Lib/_strptime.py#L457-L479
    """

    cdef:
        int hours, minutes, seconds, pad_number, microseconds
        int total_minutes
        object gmtoff_remainder, gmtoff_remainder_padding, original

    if z == 'Z':
        return pytz.FixedOffset(0)

    # Keep the caller's string so an error reports what was actually passed,
    # not the partially colon-stripped working copy.
    original = z
    if z[3] == ':':
        # Drop the hours/minutes colon; if seconds follow they must be
        # colon-separated as well.
        z = z[:3] + z[4:]
        if len(z) > 5:
            if z[5] != ':':
                msg = "Inconsistent use of : in {0}"
                raise ValueError(msg.format(original))
            z = z[:5] + z[6:]

    # From here on z has the shape [+-]HHMM[SS[.ffffff]].
    hours = int(z[1:3])
    minutes = int(z[3:5])
    seconds = int(z[5:7] or 0)

    # Pad to always return microseconds.
    gmtoff_remainder = z[8:]
    pad_number = 6 - len(gmtoff_remainder)
    gmtoff_remainder_padding = "0" * pad_number
    microseconds = int(gmtoff_remainder + gmtoff_remainder_padding)

    # Floor division keeps the total an integer number of minutes, which is
    # what is handed to pytz.FixedOffset below.
    total_minutes = ((hours * 60) + minutes + (seconds // 60) +
                     (microseconds // 60000000))
    total_minutes = -total_minutes if z.startswith("-") else total_minutes
    return pytz.FixedOffset(total_minutes)
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -1,7 +1,8 @@ | ||
| from datetime import datetime, timedelta, time | ||
| import numpy as np | ||
| from collections import MutableMapping | ||
| | ||
| import numpy as np | ||
| | ||
| from pandas._libs import tslib | ||
| from pandas._libs.tslibs.strptime import array_strptime | ||
| from pandas._libs.tslibs import parsing, conversion | ||
| | @@ -27,6 +28,7 @@ | |
| ABCDataFrame) | ||
| from pandas.core.dtypes.missing import notna | ||
| from pandas.core import algorithms | ||
| from pandas.compat import zip | ||
| | ||
| | ||
| def _guess_datetime_format_for_array(arr, **kwargs): | ||
| | @@ -103,6 +105,36 @@ def _convert_and_box_cache(arg, cache_array, box, errors, name=None): | |
| return result.values | ||
| | ||
| | ||
def _return_parsed_timezone_results(result, timezones, box):
    """
    Localize the output of array_strptime with the timezones it parsed.

    Used when a %z or %Z directive was present in the format.

    Parameters
    ----------
    result : ndarray
        int64 date representations of the dates
    timezones : ndarray
        pytz timezone objects
    box : boolean
        True boxes result as an Index-like, False returns an ndarray

    Returns
    -------
    tz_result : ndarray of parsed dates with timezone
        Returns:

        - Index-like if box=True
        - ndarray of Timestamps if box=False

    """
    # Pair each parsed value with its own timezone and localize one by one.
    localized = []
    for stamp, zone in zip(result, timezones):
        localized.append(tslib.Timestamp(stamp).tz_localize(zone))
    tz_results = np.array(localized)

    if not box:
        return tz_results

    # Imported lazily, and only when boxing is requested.
    from pandas import Index
    return Index(tz_results)
| | ||
| | ||
| def to_datetime(arg, errors='raise', dayfirst=False, yearfirst=False, | ||
| utc=None, box=True, format=None, exact=True, | ||
| unit=None, infer_datetime_format=False, origin='unix', | ||
| | @@ -343,8 +375,20 @@ def _convert_listlike(arg, box, format, name=None, tz=tz): | |
| # fallback | ||
| if result is None: | ||
| try: | ||
| result = array_strptime(arg, format, exact=exact, | ||
| errors=errors) | ||
| parsing_tzname = '%Z' in format | ||
| ||
| parsing_tzoffset = '%z' in format | ||
| if parsing_tzoffset and parsing_tzname: | ||
| raise ValueError("Cannot parse both %Z and %z") | ||
| elif tz is not None and (parsing_tzname or | ||
| parsing_tzoffset): | ||
| raise ValueError("Cannot pass a tz argument when " | ||
| "parsing strings with timezone " | ||
| "information.") | ||
| result, timezones = array_strptime( | ||
| Contributor There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I would much rather do the error handling in the _return_parsed_timezone_results. This block is just very complicated and hard to grok | ||
| arg, format, exact=exact, errors=errors) | ||
| if parsing_tzname or parsing_tzoffset: | ||
| ||
| return _return_parsed_timezone_results( | ||
| result, timezones, box) | ||
| except tslib.OutOfBoundsDatetime: | ||
| if errors == 'raise': | ||
| raise | ||
| | ||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| | @@ -179,6 +179,55 @@ def test_to_datetime_format_weeks(self, cache): | |
| for s, format, dt in data: | ||
| assert to_datetime(s, format=format, cache=cache) == dt | ||
| | ||
| @pytest.mark.parametrize("box,const,assert_equal", [ | ||
| [True, pd.Index, 'assert_index_equal'], | ||
| [False, np.array, 'assert_numpy_array_equal']]) | ||
| @pytest.mark.parametrize("fmt,dates,expected_dates", [ | ||
| ['%Y-%m-%d %H:%M:%S %Z', | ||
| ['2010-01-01 12:00:00 UTC'] * 2, | ||
| [pd.Timestamp('2010-01-01 12:00:00', tz='UTC')] * 2], | ||
| ['%Y-%m-%d %H:%M:%S %Z', | ||
| ['2010-01-01 12:00:00 UTC', | ||
| '2010-01-01 12:00:00 GMT', | ||
| '2010-01-01 12:00:00 US/Pacific'], | ||
| [pd.Timestamp('2010-01-01 12:00:00', tz='UTC'), | ||
| pd.Timestamp('2010-01-01 12:00:00', tz='GMT'), | ||
| pd.Timestamp('2010-01-01 12:00:00', tz='US/Pacific')]], | ||
| ['%Y-%m-%d %H:%M:%S %z', | ||
| Member There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Can you do one of them, e.g. this one, without the space before the tz? | ||
| ['2010-01-01 12:00:00 +0100'] * 2, | ||
| [pd.Timestamp('2010-01-01 12:00:00', | ||
| tzinfo=pytz.FixedOffset(60))] * 2], | ||
| ['%Y-%m-%d %H:%M:%S %z', | ||
| ['2010-01-01 12:00:00 +0100', '2010-01-01 12:00:00 -0100'], | ||
| [pd.Timestamp('2010-01-01 12:00:00', | ||
| tzinfo=pytz.FixedOffset(60)), | ||
| pd.Timestamp('2010-01-01 12:00:00', | ||
| tzinfo=pytz.FixedOffset(-60))]], | ||
| ['%Y-%m-%d %H:%M:%S %z', | ||
| ['2010-01-01 12:00:00 Z', '2010-01-01 12:00:00 Z'], | ||
| Member There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Should this also work with Member Author There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. The regex I pulled from https://github.com/python/cpython/blob/master/Lib/_strptime.py has an option for 'Z' with But Member There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. OK (that's probably a newer addition to python), then it makes sense to follow upstream python to be consistent | ||
| [pd.Timestamp('2010-01-01 12:00:00', | ||
| tzinfo=pytz.FixedOffset(0)), | ||
| ||
| pd.Timestamp('2010-01-01 12:00:00', | ||
| tzinfo=pytz.FixedOffset(0))]]]) | ||
| def test_to_datetime_parse_tzname_or_tzoffset(self, box, const, | ||
| assert_equal, fmt, | ||
| dates, expected_dates): | ||
| # GH 13486 | ||
| result = pd.to_datetime(dates, format=fmt, box=box) | ||
| expected = const(expected_dates) | ||
| getattr(tm, assert_equal)(result, expected) | ||
| | ||
| with pytest.raises(ValueError): | ||
| pd.to_datetime(dates, format=fmt, box=box, utc=True) | ||
| | ||
@pytest.mark.parametrize('offset', [
    '+0', '-1foo', 'UTCbar', ':10', '+01:000:01'])
def test_to_datetime_parse_timezone_malformed(self, offset):
    # Each parametrized offset is expected to make to_datetime raise
    # ValueError when parsed with a %z format.
    date_format = '%Y-%m-%d %H:%M:%S %z'
    malformed_dates = ['2010-01-01 12:00:00 ' + offset]
    with pytest.raises(ValueError):
        pd.to_datetime(malformed_dates, format=date_format)
| | ||
| | ||
| class TestToDatetime(object): | ||
| def test_to_datetime_pydatetime(self): | ||
| | ||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
you could make this a module level variable