-
- Notifications
You must be signed in to change notification settings - Fork 19.4k
[ENH] pandas.DataFrame.to_orc #44554
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from all commits
9a7b29a d11026f 0146ac3 0571602 d970b58 8b12e9f 65e6b7a 2114616 e4b40ef a7aa3e0 1ab9b6c 96969d5 2a54b8c 1caec9e 6f0a538 ae65214 045c411 c00ed0f fe275d7 9d3e0df 971f31c 52b68a0 76437ba c5d5852 b5cd022 7ad3df9 a73bb70 20aefe7 e7e81fe 6b659f7 18e5429 21cba6e c7bf39f e43c6dd afa0a8a cd585e6 b509c3c 1001002 55cab6e 89283e0 989468a a7fca36 7fc338c 91d1556 a28c5a8 162e5bb b230583 e16edab e4770b8 File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| | @@ -159,6 +159,7 @@ ORC | |
| :toctree: api/ | ||
| | ||
| read_orc | ||
| DataFrame.to_orc | ||
| | ||
| SAS | ||
| ~~~ | ||
| | ||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -1,14 +1,28 @@ | ||
| """ orc compat """ | ||
| from __future__ import annotations | ||
| | ||
| from typing import TYPE_CHECKING | ||
| import io | ||
| from types import ModuleType | ||
| from typing import ( | ||
| TYPE_CHECKING, | ||
| Any, | ||
| Literal, | ||
| ) | ||
| | ||
| from pandas._typing import ( | ||
| FilePath, | ||
| ReadBuffer, | ||
| WriteBuffer, | ||
| ) | ||
| from pandas.compat._optional import import_optional_dependency | ||
| | ||
| from pandas.core.dtypes.common import ( | ||
| is_categorical_dtype, | ||
| is_interval_dtype, | ||
| is_period_dtype, | ||
| is_unsigned_integer_dtype, | ||
| ) | ||
| | ||
| from pandas.io.common import get_handle | ||
| | ||
| if TYPE_CHECKING: | ||
| | @@ -52,3 +66,111 @@ def read_orc( | |
| with get_handle(path, "rb", is_text=False) as handles: | ||
| orc_file = orc.ORCFile(handles.handle) | ||
| return orc_file.read(columns=columns, **kwargs).to_pandas() | ||
| | ||
| | ||
| def to_orc( | ||
| df: DataFrame, | ||
| path: FilePath | WriteBuffer[bytes] | None = None, | ||
twoertwein marked this conversation as resolved. Show resolved Hide resolved | ||
| *, | ||
| engine: Literal["pyarrow"] = "pyarrow", | ||
twoertwein marked this conversation as resolved. Show resolved Hide resolved | ||
| index: bool | None = None, | ||
| engine_kwargs: dict[str, Any] | None = None, | ||
| ) -> bytes | None: | ||
| """ | ||
| Write a DataFrame to the ORC format. | ||
| | ||
| .. versionadded:: 1.5.0 | ||
| | ||
| Parameters | ||
| ---------- | ||
| df : DataFrame | ||
| The dataframe to be written to ORC. Raises NotImplementedError | ||
| if dtype of one or more columns is category, unsigned integers, | ||
| intervals, periods or sparse. | ||
| path : str, file-like object or None, default None | ||
| If a string, it will be used as Root Directory path | ||
| when writing a partitioned dataset. By file-like object, | ||
| we refer to objects with a write() method, such as a file handle | ||
| (e.g. via builtin open function). If path is None, | ||
| a bytes object is returned. | ||
| engine : str, default 'pyarrow' | ||
| ORC library to use. Pyarrow must be >= 7.0.0. | ||
| index : bool, optional | ||
| If ``True``, include the dataframe's index(es) in the file output. If | ||
| ``False``, they will not be written to the file. | ||
| If ``None``, similar to ``infer`` the dataframe's index(es) | ||
| will be saved. However, instead of being saved as values, | ||
| the RangeIndex will be stored as a range in the metadata so it | ||
| doesn't require much space and is faster. Other indexes will | ||
| be included as columns in the file output. | ||
| engine_kwargs : dict[str, Any] or None, default None | ||
| Additional keyword arguments passed to :func:`pyarrow.orc.write_table`. | ||
| | ||
| Returns | ||
| ------- | ||
| bytes if no path argument is provided else None | ||
| | ||
| Raises | ||
| ------ | ||
| NotImplementedError | ||
| Dtype of one or more columns is category, unsigned integers, interval, | ||
| period or sparse. | ||
| ValueError | ||
| engine is not pyarrow. | ||
| | ||
| Notes | ||
| ----- | ||
| * Before using this function you should read the | ||
| :ref:`user guide about ORC <io.orc>` and | ||
| :ref:`install optional dependencies <install.warn_orc>`. | ||
| * This function requires `pyarrow <https://arrow.apache.org/docs/python/>`_ | ||
| library. | ||
| * For supported dtypes please refer to `supported ORC features in Arrow | ||
| <https://arrow.apache.org/docs/cpp/orc.html#data-types>`__. | ||
| * Currently timezones in datetime columns are not preserved when a | ||
| dataframe is converted into ORC files. | ||
| """ | ||
| if index is None: | ||
| index = df.index.names[0] is not None | ||
| if engine_kwargs is None: | ||
| engine_kwargs = {} | ||
| | ||
| # If unsupported dtypes are found raise NotImplementedError | ||
| # In Pyarrow 9.0.0 this check will no longer be needed | ||
| for dtype in df.dtypes: | ||
| if ( | ||
| Member There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Will pyarrow raise if these dtypes are passed? If so, can a a pyarrow error be caught and reraised as a Contributor Author There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I need to test these types individually. Not sure right now. Contributor Author There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. @mroeschke It seg faults out for all instances but sparse. I need to catch them in Arrow 9.0.0. Meanwhile can we use the current dtype filter? Member There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Okay, this is fine then given:
Contributor Author There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Sure! Contributor Author There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Done! Since for sparse dtypes we get a | ||
| is_categorical_dtype(dtype) | ||
| or is_interval_dtype(dtype) | ||
| or is_period_dtype(dtype) | ||
| or is_unsigned_integer_dtype(dtype) | ||
| ): | ||
| raise NotImplementedError( | ||
| "The dtype of one or more columns is not supported yet." | ||
| ) | ||
| | ||
| if engine != "pyarrow": | ||
| raise ValueError("engine must be 'pyarrow'") | ||
| engine = import_optional_dependency(engine, min_version="7.0.0") | ||
| orc = import_optional_dependency("pyarrow.orc") | ||
| | ||
| was_none = path is None | ||
| if was_none: | ||
| path = io.BytesIO() | ||
| assert path is not None # For mypy | ||
| with get_handle(path, "wb", is_text=False) as handles: | ||
| assert isinstance(engine, ModuleType) # For mypy | ||
| try: | ||
| orc.write_table( | ||
| engine.Table.from_pandas(df, preserve_index=index), | ||
| handles.handle, | ||
| **engine_kwargs, | ||
| ) | ||
| except TypeError as e: | ||
| raise NotImplementedError( | ||
| "The dtype of one or more columns is not supported yet." | ||
| ) from e | ||
| | ||
| if was_none: | ||
| assert isinstance(path, io.BytesIO) # For mypy | ||
| return path.getvalue() | ||
| return None | ||
Uh oh!
There was an error while loading. Please reload this page.