BUG: read_csv with custom date parser and na_filter=True results in ValueError

from io import StringIO

import numpy as np
import pandas as pd


def __custom_date_parser(time):
    """Convert float seconds (GPS seconds of week) to timedeltas.

    Parameters
    ----------
    time : array-like with an ``astype`` method
        Presumably the raw values of the ``time`` column handed over by
        ``pd.read_csv`` -- confirm against the pandas version in use.

    Returns
    -------
    The result of ``pd.to_timedelta(..., unit='s')`` on the integer seconds.
    """
    # Convert float seconds to int type.  The builtins ``float``/``int`` are
    # used instead of ``np.float``/``np.int``: those names were plain aliases
    # of the builtins, deprecated in NumPy 1.20 and removed in NumPy 1.24.
    time_temp = time.astype(float).astype(int)
    return pd.to_timedelta(time_temp, unit='s')


# Whitespace-delimited sample data: a header row followed by five records.
testdata = StringIO("""time e n h
41047.00	-98573.7297	871458.0640	389.0089
41048.00	-98573.7299	871458.0640	389.0089
41049.00	-98573.7300	871458.0642	389.0088
41050.00	-98573.7299	871458.0643	389.0088
41051.00	-98573.7302	871458.0640	389.0086
""")

# This is the call that raises the reported ValueError on pandas 1.1.1 while
# na_filter keeps its default value of True.
df = pd.read_csv(testdata, delim_whitespace=True, parse_dates=True, date_parser=__custom_date_parser, index_col='time')

I noticed this problem when I ran a piece of old code that had worked a few months ago. Normally this code parses a text file with GPS seconds of week as the time column and converts it to a `TimedeltaIndex`. Now, when I execute it, it fails with `ValueError: unit abbreviation w/o a number` (full stack trace below). I tracked it down to the default option `na_filter=True` in `pd.read_csv`; when I set it to `False`, everything works. With a bit of digging, I think I found the source of the error in `algorithms.py` -> `_ensure_data` -> line 142.

 # datetimelike vals_dtype = getattr(values, "dtype", None) if needs_i8_conversion(vals_dtype) or needs_i8_conversion(dtype): if is_period_dtype(vals_dtype) or is_period_dtype(dtype): from pandas import PeriodIndex values = PeriodIndex(values) dtype = values.dtype elif is_timedelta64_dtype(vals_dtype) or is_timedelta64_dtype(dtype): from pandas import TimedeltaIndex values = TimedeltaIndex(values) #This is line 142 dtype = values.dtype else: # Datetime if values.ndim > 1 and is_datetime64_ns_dtype(vals_dtype): # Avoid calling the DatetimeIndex constructor as it is 1D only # Note: this is reached by DataFrame.rank calls GH#27027 # TODO(EA2D): special case not needed with 2D EAs asi8 = values.view("i8") dtype = values.dtype return asi8, dtype from pandas import DatetimeIndex values = DatetimeIndex(values) dtype = values.dtype

Here the function tries to parse `values` as a `TimedeltaIndex`, but in this case `values` is the list of NA marker strings: ['' 'n/a' '-nan' '#N/A' '1.#QNAN' 'nan' '#NA' 'NaN' '-1.#QNAN' '#N/A N/A', '-NaN' 'N/A' 'NULL' '' 'null' '1.#IND' 'NA' '-1.#IND']. This branch is taken because `is_timedelta64_dtype(dtype)` is true here. I can't believe that this is the expected behaviour, as it has worked before.

Traceback (most recent call last):
  File "...\lib\site-packages\pandas\io\parsers.py", line 458, in _read
    data = parser.read(nrows)
  File "...\lib\site-packages\pandas\io\parsers.py", line 1186, in read
    ret = self._engine.read(nrows)
  File "...\lib\site-packages\pandas\io\parsers.py", line 2221, in read
    index, names = self._make_index(data, alldata, names)
  File "...\lib\site-packages\pandas\io\parsers.py", line 1667, in _make_index
    index = self._agg_index(index)
  File "...\lib\site-packages\pandas\io\parsers.py", line 1760, in _agg_index
    arr, _ = self._infer_types(arr, col_na_values | col_na_fvalues)
  File "...\lib\site-packages\pandas\io\parsers.py", line 1861, in _infer_types
    mask = algorithms.isin(values, list(na_values))
  File "...\lib\site-packages\pandas\core\algorithms.py", line 433, in isin
    values, _ = _ensure_data(values, dtype=dtype)
  File "...\lib\site-packages\pandas\core\algorithms.py", line 142, in _ensure_data
    values = TimedeltaIndex(values)
  File "...\lib\site-packages\pandas\core\indexes\timedeltas.py", line 157, in __new__
    data, freq=freq, unit=unit, dtype=dtype, copy=copy
  File "...\lib\site-packages\pandas\core\arrays\timedeltas.py", line 216, in _from_sequence
    data, inferred_freq = sequence_to_td64ns(data, copy=copy, unit=unit)
  File "...\lib\site-packages\pandas\core\arrays\timedeltas.py", line 930, in sequence_to_td64ns
    data = objects_to_td64ns(data, unit=unit, errors=errors)
  File "...\lib\site-packages\pandas\core\arrays\timedeltas.py", line 1040, in objects_to_td64ns
    result = array_to_timedelta64(values, unit=unit, errors=errors)
  File "pandas\_libs\tslibs\timedeltas.pyx", line 273, in pandas._libs.tslibs.timedeltas.array_to_timedelta64
  File "pandas\_libs\tslibs\timedeltas.pyx", line 268, in pandas._libs.tslibs.timedeltas.array_to_timedelta64
  File "pandas\_libs\tslibs\timedeltas.pyx", line 215, in pandas._libs.tslibs.timedeltas.convert_to_timedelta64
  File "pandas\_libs\tslibs\timedeltas.pyx", line 428, in pandas._libs.tslibs.timedeltas.parse_timedelta_string
ValueError: unit abbreviation w/o a number
python-BaseException

INSTALLED VERSIONS

commit : f2ca0a2
python : 3.7.9.final.0
python-bits : 64
OS : Windows
OS-release : 10
Version : 10.0.18362
machine : AMD64
processor : Intel64 Family 6 Model 94 Stepping 3, GenuineIntel
byteorder : little
LC_ALL : None
LANG : None
LOCALE : None.None
pandas : 1.1.1
numpy : 1.18.1
pytz : 2020.1
dateutil : 2.8.1
pip : 20.2.2
setuptools : 49.6.0.post20200814
Cython : None
pytest : None
hypothesis : None
sphinx : None
blosc : None
feather : None
xlsxwriter : None
lxml.etree : None
html5lib : None
pymysql : None
psycopg2 : None
jinja2 : None
IPython : None
pandas_datareader: None
bs4 : None
bottleneck : None
fsspec : None
fastparquet : None
gcsfs : None
matplotlib : 3.1.2
numexpr : None
odfpy : None
openpyxl : None
pandas_gbq : None
pyarrow : None
pytables : None
pyxlsb : None
s3fs : None
scipy : 1.4.1
sqlalchemy : None
tables : None
tabulate : None
xarray : None
xlrd : None
xlwt : None
numba : None

Provide feedback

Saved searches

Use saved searches to filter your results more quickly

Uh oh!

Uh oh!

BUG: read_csv with custom date parser and na_filter=True results in ValueError #36111

INSTALLED VERSIONS

Metadata

Assignees

Labels

Type

Projects

Milestone

Relationships

Development

Uh oh!

BUG: read_csv with custom date parser and na_filter=True results in ValueError #36111

Description

INSTALLED VERSIONS

Metadata

Metadata

Assignees

Labels

Type

Projects

Milestone

Relationships

Development

Issue actions