pandas-dev · mroeschke · Jun 14, 2022 · Oct 3, 2021 · Oct 3, 2021 · Oct 3, 2021
diff --git a/doc/source/reference/frame.rst b/doc/source/reference/frame.rst
@@ -373,6 +373,7 @@ Serialization / IO / conversion
 
  DataFrame.from_dict
  DataFrame.from_records
+ DataFrame.to_orc
  DataFrame.to_parquet
  DataFrame.to_pickle
  DataFrame.to_csv

diff --git a/doc/source/reference/io.rst b/doc/source/reference/io.rst
@@ -159,6 +159,7 @@ ORC
  :toctree: api/
 
  read_orc
+ DataFrame.to_orc
 
 SAS
 ~~~

diff --git a/doc/source/user_guide/io.rst b/doc/source/user_guide/io.rst
@@ -30,7 +30,7 @@ The pandas I/O API is a set of top level ``reader`` functions accessed like
  binary;`HDF5 Format <https://support.hdfgroup.org/HDF5/whatishdf5.html>`__;:ref:`read_hdf<io.hdf5>`;:ref:`to_hdf<io.hdf5>`
  binary;`Feather Format <https://github.com/wesm/feather>`__;:ref:`read_feather<io.feather>`;:ref:`to_feather<io.feather>`
  binary;`Parquet Format <https://parquet.apache.org/>`__;:ref:`read_parquet<io.parquet>`;:ref:`to_parquet<io.parquet>`
- binary;`ORC Format <https://orc.apache.org/>`__;:ref:`read_orc<io.orc>`;
+ binary;`ORC Format <https://orc.apache.org/>`__;:ref:`read_orc<io.orc>`;:ref:`to_orc<io.orc>`
  binary;`Stata <https://en.wikipedia.org/wiki/Stata>`__;:ref:`read_stata<io.stata_reader>`;:ref:`to_stata<io.stata_writer>`
  binary;`SAS <https://en.wikipedia.org/wiki/SAS_(software)>`__;:ref:`read_sas<io.sas_reader>`;
  binary;`SPSS <https://en.wikipedia.org/wiki/SPSS>`__;:ref:`read_spss<io.spss_reader>`;
@@ -5562,13 +5562,64 @@ ORC
 .. versionadded:: 1.0.0
 
 Similar to the :ref:`parquet <io.parquet>` format, the `ORC Format <https://orc.apache.org/>`__ is a binary columnar serialization
-for data frames. It is designed to make reading data frames efficient. pandas provides *only* a reader for the
-ORC format, :func:`~pandas.read_orc`. This requires the `pyarrow <https://arrow.apache.org/docs/python/>`__ library.
+for data frames. It is designed to make reading data frames efficient. pandas provides both the reader and the writer for the
+ORC format, :func:`~pandas.read_orc` and :func:`~pandas.DataFrame.to_orc`. This requires the `pyarrow <https://arrow.apache.org/docs/python/>`__ library.
 
 .. warning::
 
  * It is *highly recommended* to install pyarrow using conda due to some issues caused by pyarrow.
- * :func:`~pandas.read_orc` is not supported on Windows yet, you can find valid environments on :ref:`install optional dependencies <install.warn_orc>`.
+ * :func:`~pandas.DataFrame.to_orc` requires pyarrow>=7.0.0.
+ * :func:`~pandas.read_orc` and :func:`~pandas.DataFrame.to_orc` are not supported on Windows yet, you can find valid environments on :ref:`install optional dependencies <install.warn_orc>`.
+ * For supported dtypes please refer to `supported ORC features in Arrow <https://arrow.apache.org/docs/cpp/orc.html#data-types>`__.
+ * Currently timezones in datetime columns are not preserved when a dataframe is converted into ORC files.
+
+.. ipython:: python
+
+ df = pd.DataFrame(
+ {
+ "a": list("abc"),
+ "b": list(range(1, 4)),
+ "c": np.arange(4.0, 7.0, dtype="float64"),
+ "d": [True, False, True],
+ "e": pd.date_range("20130101", periods=3),
+ }
+ )
+
+ df
+ df.dtypes
+
+Write to an orc file.
+
+.. ipython:: python
+ :okwarning:
+
+ df.to_orc("example_pa.orc", engine="pyarrow")
+
+Read from an orc file.
+
+.. ipython:: python
+ :okwarning:
+
+ result = pd.read_orc("example_pa.orc")
+
+ result.dtypes
+
+Read only certain columns of an orc file.
+
+.. ipython:: python
+
+ result = pd.read_orc(
+ "example_pa.orc",
+ columns=["a", "b"],
+ )
+ result.dtypes
+
+
+.. ipython:: python
+ :suppress:
+
+ os.remove("example_pa.orc")
+
 
 .. _io.sql:
 

diff --git a/doc/source/whatsnew/v1.5.0.rst b/doc/source/whatsnew/v1.5.0.rst
@@ -100,6 +100,28 @@ as seen in the following example.
  1 2021-01-02 08:00:00 4
  2 2021-01-02 16:00:00 5
 
+.. _whatsnew_150.enhancements.orc:
+
+Writing to ORC files
+^^^^^^^^^^^^^^^^^^^^
+
+The new method :meth:`DataFrame.to_orc` allows writing to ORC files (:issue:`43864`).
+
+This functionality depends on the `pyarrow <http://arrow.apache.org/docs/python/>`__ library. For more details, see :ref:`the IO docs on ORC <io.orc>`.
+
+.. warning::
+
+ * It is *highly recommended* to install pyarrow using conda due to some issues caused by pyarrow.
+ * :func:`~pandas.DataFrame.to_orc` requires pyarrow>=7.0.0.
+ * :func:`~pandas.DataFrame.to_orc` is not supported on Windows yet, you can find valid environments on :ref:`install optional dependencies <install.warn_orc>`.
+ * For supported dtypes please refer to `supported ORC features in Arrow <https://arrow.apache.org/docs/cpp/orc.html#data-types>`__.
+ * Currently timezones in datetime columns are not preserved when a dataframe is converted into ORC files.
+
+.. code-block:: python
+
+ df = pd.DataFrame(data={"col1": [1, 2], "col2": [3, 4]})
+ df.to_orc("./out.orc")
+
 .. _whatsnew_150.enhancements.tar:
 
 Reading directly from TAR archives

diff --git a/pandas/core/frame.py b/pandas/core/frame.py
@@ -2858,6 +2858,7 @@ def to_parquet(
  See Also
  --------
  read_parquet : Read a parquet file.
+ DataFrame.to_orc : Write an orc file.
  DataFrame.to_csv : Write a csv file.
  DataFrame.to_sql : Write to a sql table.
  DataFrame.to_hdf : Write to hdf.
@@ -2901,6 +2902,93 @@ def to_parquet(
  **kwargs,
  )
 
def to_orc(
    self,
    path: FilePath | WriteBuffer[bytes] | None = None,
    *,
    engine: Literal["pyarrow"] = "pyarrow",
    index: bool | None = None,
    engine_kwargs: dict[str, Any] | None = None,
) -> bytes | None:
    """
    Write a DataFrame to the ORC format.

    .. versionadded:: 1.5.0

    Parameters
    ----------
    path : str, file-like object or None, default None
        Destination of the ORC data. A string is used as the path of the
        file to write. By file-like object, we refer to objects with a
        binary ``write()`` method, such as a file handle (e.g. via the
        builtin ``open`` function). If ``None``, nothing is written to a
        file and the serialized content is returned as a bytes object.
    engine : str, default 'pyarrow'
        ORC library to use. Only ``'pyarrow'`` is accepted, and pyarrow
        must be >= 7.0.0.
    index : bool, optional
        If ``True``, include the dataframe's index(es) in the file output.
        If ``False``, they will not be written to the file.
        If ``None`` (the default), the index is included only when it is
        named, i.e. when the first entry of ``DataFrame.index.names`` is
        not ``None``.
    engine_kwargs : dict[str, Any] or None, default None
        Additional keyword arguments passed to :func:`pyarrow.orc.write_table`.

    Returns
    -------
    bytes if no path argument is provided else None

    Raises
    ------
    NotImplementedError
        Dtype of one or more columns is category, unsigned integers, interval,
        period or sparse.
    ValueError
        engine is not pyarrow.

    See Also
    --------
    read_orc : Read an ORC file.
    DataFrame.to_parquet : Write a parquet file.
    DataFrame.to_csv : Write a csv file.
    DataFrame.to_sql : Write to a sql table.
    DataFrame.to_hdf : Write to hdf.

    Notes
    -----
    * Before using this function you should read the :ref:`user guide about
      ORC <io.orc>` and :ref:`install optional dependencies <install.warn_orc>`.
    * This function requires the `pyarrow
      <https://arrow.apache.org/docs/python/>`_ library.
    * For supported dtypes please refer to `supported ORC features in Arrow
      <https://arrow.apache.org/docs/cpp/orc.html#data-types>`__.
    * Currently timezones in datetime columns are not preserved when a
      dataframe is converted into ORC files.

    Examples
    --------
    >>> df = pd.DataFrame(data={'col1': [1, 2], 'col2': [4, 3]})
    >>> df.to_orc('df.orc')  # doctest: +SKIP
    >>> pd.read_orc('df.orc')  # doctest: +SKIP
       col1  col2
    0     1     4
    1     2     3

    If you want to get a buffer to the orc content you can write it to io.BytesIO

    >>> import io
    >>> b = io.BytesIO(df.to_orc())  # doctest: +SKIP
    >>> b.seek(0)  # doctest: +SKIP
    0
    >>> content = b.read()  # doctest: +SKIP
    """
    # Imported lazily so the ORC machinery is only loaded when it is used.
    from pandas.io import orc as orc_io

    return orc_io.to_orc(
        self,
        path,
        engine=engine,
        index=index,
        engine_kwargs=engine_kwargs,
    )
+
  @Substitution(
  header_type="bool",
  header="Whether to print column labels, default True",

diff --git a/pandas/core/generic.py b/pandas/core/generic.py
@@ -2629,6 +2629,7 @@ def to_hdf(
  See Also
  --------
  read_hdf : Read from HDF file.
+ DataFrame.to_orc : Write a DataFrame to the binary orc format.
  DataFrame.to_parquet : Write a DataFrame to the binary parquet format.
  DataFrame.to_sql : Write to a SQL table.
  DataFrame.to_feather : Write out feather-format for DataFrames.

diff --git a/pandas/io/orc.py b/pandas/io/orc.py
@@ -1,14 +1,28 @@
 """ orc compat """
 from __future__ import annotations
 
-from typing import TYPE_CHECKING
+import io
+from types import ModuleType
+from typing import (
+ TYPE_CHECKING,
+ Any,
+ Literal,
+)
 
 from pandas._typing import (
  FilePath,
  ReadBuffer,
+ WriteBuffer,
 )
 from pandas.compat._optional import import_optional_dependency
 
+from pandas.core.dtypes.common import (
+ is_categorical_dtype,
+ is_interval_dtype,
+ is_period_dtype,
+ is_unsigned_integer_dtype,
+)
+
 from pandas.io.common import get_handle
 
 if TYPE_CHECKING:
@@ -52,3 +66,111 @@ def read_orc(
  with get_handle(path, "rb", is_text=False) as handles:
  orc_file = orc.ORCFile(handles.handle)
  return orc_file.read(columns=columns, **kwargs).to_pandas()
+
+
def to_orc(
    df: DataFrame,
    path: FilePath | WriteBuffer[bytes] | None = None,
    *,
    engine: Literal["pyarrow"] = "pyarrow",
    index: bool | None = None,
    engine_kwargs: dict[str, Any] | None = None,
) -> bytes | None:
    """
    Write a DataFrame to the ORC format.

    .. versionadded:: 1.5.0

    Parameters
    ----------
    df : DataFrame
        The dataframe to be written to ORC. Raises NotImplementedError
        if dtype of one or more columns is category, unsigned integers,
        intervals, periods or sparse.
    path : str, file-like object or None, default None
        Destination of the ORC data. A string is opened as a file in
        binary write mode. By file-like object, we refer to objects with
        a binary ``write()`` method, such as a file handle (e.g. via the
        builtin ``open`` function). If ``None``, the data is serialized
        into an in-memory buffer and returned as a bytes object.
    engine : str, default 'pyarrow'
        ORC library to use. Only ``'pyarrow'`` is accepted, and pyarrow
        must be >= 7.0.0.
    index : bool, optional
        Passed to :meth:`pyarrow.Table.from_pandas` as ``preserve_index``.
        If ``True``, include the dataframe's index(es) in the file output.
        If ``False``, they will not be written to the file.
        If ``None`` (the default), the index is included only when it is
        named, i.e. when ``df.index.names[0]`` is not ``None``.
    engine_kwargs : dict[str, Any] or None, default None
        Additional keyword arguments passed to :func:`pyarrow.orc.write_table`.

    Returns
    -------
    bytes if no path argument is provided else None

    Raises
    ------
    NotImplementedError
        Dtype of one or more columns is category, unsigned integers, interval,
        period or sparse.
    ValueError
        engine is not pyarrow.

    Notes
    -----
    * Before using this function you should read the
      :ref:`user guide about ORC <io.orc>` and
      :ref:`install optional dependencies <install.warn_orc>`.
    * This function requires the `pyarrow
      <https://arrow.apache.org/docs/python/>`_ library.
    * For supported dtypes please refer to `supported ORC features in Arrow
      <https://arrow.apache.org/docs/cpp/orc.html#data-types>`__.
    * Currently timezones in datetime columns are not preserved when a
      dataframe is converted into ORC files.
    """
    if index is None:
        # By default only a named index is kept; the name of the first
        # level decides.
        index = df.index.names[0] is not None
    if engine_kwargs is None:
        engine_kwargs = {}

    # If unsupported dtypes are found raise NotImplementedError
    # In Pyarrow 9.0.0 this check will no longer be needed
    if any(
        is_categorical_dtype(dtype)
        or is_interval_dtype(dtype)
        or is_period_dtype(dtype)
        or is_unsigned_integer_dtype(dtype)
        for dtype in df.dtypes
    ):
        raise NotImplementedError(
            "The dtype of one or more columns is not supported yet."
        )

    if engine != "pyarrow":
        raise ValueError("engine must be 'pyarrow'")
    # Bind the imported modules to their own names instead of rebinding the
    # ``engine`` parameter, so no type-narrowing asserts are needed.
    pa = import_optional_dependency(engine, min_version="7.0.0")
    orc = import_optional_dependency("pyarrow.orc")

    # Without a destination, serialize into memory and return the bytes.
    buffer: io.BytesIO | None = None
    destination = path
    if destination is None:
        destination = buffer = io.BytesIO()

    with get_handle(destination, "wb", is_text=False) as handles:
        try:
            orc.write_table(
                pa.Table.from_pandas(df, preserve_index=index),
                handles.handle,
                **engine_kwargs,
            )
        except TypeError as e:
            # Reported as an unsupported-dtype error, matching the
            # up-front check above.
            raise NotImplementedError(
                "The dtype of one or more columns is not supported yet."
            ) from e

    if buffer is None:
        return None
    return buffer.getvalue()
-Original file line number
+Diff line change
@@ Expand Up / @@ -159,6 +159,7 @@ ORC @@
   :toctree: api/
   read_orc
+  DataFrame.to_orc
  SAS
  ~~~
@@ Expand Down @@