-
- Notifications
You must be signed in to change notification settings - Fork 19.4k
Description
```python
import numpy as np
import pandas as pd
from io import StringIO


def __custom_date_parser(time):
    time_temp = time.astype(np.float).astype(np.int)  # convert float seconds to int type
    return pd.to_timedelta(time_temp, unit='s')


testdata = StringIO("""time e n h
41047.00 -98573.7297 871458.0640 389.0089
41048.00 -98573.7299 871458.0640 389.0089
41049.00 -98573.7300 871458.0642 389.0088
41050.00 -98573.7299 871458.0643 389.0088
41051.00 -98573.7302 871458.0640 389.0086
""")

df = pd.read_csv(testdata, delim_whitespace=True, parse_dates=True, date_parser=__custom_date_parser, index_col='time')
```

I noticed this problem when I executed a piece of old code that worked before (a few months ago). Normally this code parses a text file with GPS seconds of week as time and converts it to a `TimedeltaIndex`. Now, when I execute it, it raises `ValueError: unit abbreviation w/o a number` (full stack trace below).

I tracked it down to the default option `na_filter=True` in `pd.read_csv`. When I set it to `False`, everything works. With a bit of digging, I think I found the source of the error in `algorithms.py` -> `_ensure_data` -> line 142.
```python
    # datetimelike
    vals_dtype = getattr(values, "dtype", None)
    if needs_i8_conversion(vals_dtype) or needs_i8_conversion(dtype):
        if is_period_dtype(vals_dtype) or is_period_dtype(dtype):
            from pandas import PeriodIndex

            values = PeriodIndex(values)
            dtype = values.dtype
        elif is_timedelta64_dtype(vals_dtype) or is_timedelta64_dtype(dtype):
            from pandas import TimedeltaIndex

            values = TimedeltaIndex(values)  #This is line 142
            dtype = values.dtype
        else:
            # Datetime
            if values.ndim > 1 and is_datetime64_ns_dtype(vals_dtype):
                # Avoid calling the DatetimeIndex constructor as it is 1D only
                # Note: this is reached by DataFrame.rank calls GH#27027
                # TODO(EA2D): special case not needed with 2D EAs
                asi8 = values.view("i8")
                dtype = values.dtype
                return asi8, dtype

            from pandas import DatetimeIndex

            values = DatetimeIndex(values)
            dtype = values.dtype
```

Here the function tries to convert `values` to a `TimedeltaIndex`, but in this case `values` is the list of NA marker strings:

```
['' 'n/a' '-nan' '#N/A' '1.#QNAN' 'nan' '#NA' 'NaN' '-1.#QNAN' '#N/A N/A', '-NaN' 'N/A' 'NULL' '' 'null' '1.#IND' 'NA' '-1.#IND']
```

It takes the `elif` branch because `is_timedelta64_dtype(dtype)` is true here, so the NA strings are passed to the `TimedeltaIndex` constructor, which cannot parse them. I can't believe that this is expected behaviour, as it has worked before.
```
Traceback (most recent call last):
  File "...\lib\site-packages\pandas\io\parsers.py", line 458, in _read
    data = parser.read(nrows)
  File "...\lib\site-packages\pandas\io\parsers.py", line 1186, in read
    ret = self._engine.read(nrows)
  File "...\lib\site-packages\pandas\io\parsers.py", line 2221, in read
    index, names = self._make_index(data, alldata, names)
  File "...\lib\site-packages\pandas\io\parsers.py", line 1667, in _make_index
    index = self._agg_index(index)
  File "...\lib\site-packages\pandas\io\parsers.py", line 1760, in _agg_index
    arr, _ = self._infer_types(arr, col_na_values | col_na_fvalues)
  File "...\lib\site-packages\pandas\io\parsers.py", line 1861, in _infer_types
    mask = algorithms.isin(values, list(na_values))
  File "...\lib\site-packages\pandas\core\algorithms.py", line 433, in isin
    values, _ = _ensure_data(values, dtype=dtype)
  File "...\lib\site-packages\pandas\core\algorithms.py", line 142, in _ensure_data
    values = TimedeltaIndex(values)
  File "...\lib\site-packages\pandas\core\indexes\timedeltas.py", line 157, in __new__
    data, freq=freq, unit=unit, dtype=dtype, copy=copy
  File "...\lib\site-packages\pandas\core\arrays\timedeltas.py", line 216, in _from_sequence
    data, inferred_freq = sequence_to_td64ns(data, copy=copy, unit=unit)
  File "...\lib\site-packages\pandas\core\arrays\timedeltas.py", line 930, in sequence_to_td64ns
    data = objects_to_td64ns(data, unit=unit, errors=errors)
  File "...\lib\site-packages\pandas\core\arrays\timedeltas.py", line 1040, in objects_to_td64ns
    result = array_to_timedelta64(values, unit=unit, errors=errors)
  File "pandas\_libs\tslibs\timedeltas.pyx", line 273, in pandas._libs.tslibs.timedeltas.array_to_timedelta64
  File "pandas\_libs\tslibs\timedeltas.pyx", line 268, in pandas._libs.tslibs.timedeltas.array_to_timedelta64
  File "pandas\_libs\tslibs\timedeltas.pyx", line 215, in pandas._libs.tslibs.timedeltas.convert_to_timedelta64
  File "pandas\_libs\tslibs\timedeltas.pyx", line 428, in pandas._libs.tslibs.timedeltas.parse_timedelta_string
ValueError: unit abbreviation w/o a number
python-BaseException
```

INSTALLED VERSIONS
commit : f2ca0a2
python : 3.7.9.final.0
python-bits : 64
OS : Windows
OS-release : 10
Version : 10.0.18362
machine : AMD64
processor : Intel64 Family 6 Model 94 Stepping 3, GenuineIntel
byteorder : little
LC_ALL : None
LANG : None
LOCALE : None.None
pandas : 1.1.1
numpy : 1.18.1
pytz : 2020.1
dateutil : 2.8.1
pip : 20.2.2
setuptools : 49.6.0.post20200814
Cython : None
pytest : None
hypothesis : None
sphinx : None
blosc : None
feather : None
xlsxwriter : None
lxml.etree : None
html5lib : None
pymysql : None
psycopg2 : None
jinja2 : None
IPython : None
pandas_datareader: None
bs4 : None
bottleneck : None
fsspec : None
fastparquet : None
gcsfs : None
matplotlib : 3.1.2
numexpr : None
odfpy : None
openpyxl : None
pandas_gbq : None
pyarrow : None
pytables : None
pyxlsb : None
s3fs : None
scipy : 1.4.1
sqlalchemy : None
tables : None
tabulate : None
xarray : None
xlrd : None
xlwt : None
numba : None