Add BytesIO support to to_orc (return bytes when no path is given) & test

pandas-dev · mroeschke · Jun 14, 2022 · Oct 3, 2021 · Oct 3, 2021 · Oct 3, 2021
commit 045c411d8640a002e2463c1df1b0ced498ca3bd9
diff --git a/doc/source/user_guide/scale.rst b/doc/source/user_guide/scale.rst
@@ -18,23 +18,6 @@ tool for all situations. If you're working with very large datasets and a tool
 like PostgreSQL fits your needs, then you should probably be using that.
 Assuming you want or need the expressiveness and power of pandas, let's carry on.
 
-.. ipython:: python
-
- import pandas as pd
- import numpy as np
-
-.. ipython:: python
- :suppress:
-
- from pandas._testing import _make_timeseries
-
- # Make a random in-memory dataset
- ts = _make_timeseries(freq="30S", seed=0)
- ts.to_csv("timeseries.csv")
- ts.to_orc("timeseries.orc")
- ts.to_parquet("timeseries.parquet")
-
-
 Load less data
 --------------
 

diff --git a/pandas/io/orc.py b/pandas/io/orc.py
@@ -1,6 +1,7 @@
 """ orc compat """
 from __future__ import annotations
 
+import io
 from typing import (
  TYPE_CHECKING,
  Literal,
@@ -100,6 +101,14 @@ def to_orc(
  raise ValueError("engine must be 'pyarrow'")
  engine = import_optional_dependency(engine, min_version="7.0.0")
 
+ path_or_buf: FilePath | WriteBuffer[bytes] = io.BytesIO() if path is None else path
  engine.orc.write_table(
- engine.Table.from_pandas(df, preserve_index=index), path, **kwargs
+ engine.Table.from_pandas(df, preserve_index=index), path_or_buf, **kwargs
  )
+
+ if path is None:
+ assert isinstance(path_or_buf, io.BytesIO)
+ return path_or_buf.getvalue()
+ else:
+ return None
+
diff --git a/pandas/tests/io/test_orc.py b/pandas/tests/io/test_orc.py
@@ -226,7 +226,7 @@ def test_orc_reader_snappy_compressed(dirpath):
  tm.assert_equal(expected, got)
 
 
-def test_orc_roundtrip(dirpath):
+def test_orc_roundtrip_file(dirpath):
  # GH44554
  # PyArrow gained ORC write support with the current argument order
  pytest.importorskip("pyarrow", minversion="7.0.0")
@@ -248,3 +248,26 @@ def test_orc_roundtrip(dirpath):
  got = read_orc(outputfile)
 
  tm.assert_equal(expected, got)
+
+
+def test_orc_roundtrip_bytesio():
+    """Round-trip a DataFrame through ``to_orc()`` called without a path.
+
+    With no ``path`` argument ``to_orc`` returns the serialized ORC data as
+    ``bytes``. Those bytes are wrapped in a ``BytesIO`` so that ``read_orc``
+    is handed a file-like object rather than a raw ``bytes`` value.
+    """
+    # GH44554
+    # PyArrow gained ORC write support with the current argument order
+    pytest.importorskip("pyarrow", minversion="7.0.0")
+    # Imported locally so the test does not rely on a module-level import.
+    from io import BytesIO
+
+    data = {
+        "boolean1": np.array([False, True], dtype="bool"),
+        "byte1": np.array([1, 100], dtype="int8"),
+        "short1": np.array([1024, 2048], dtype="int16"),
+        "int1": np.array([65536, 65536], dtype="int32"),
+        "long1": np.array([9223372036854775807, 9223372036854775807], dtype="int64"),
+        "float1": np.array([1.0, 2.0], dtype="float32"),
+        "double1": np.array([-15.0, -5.0], dtype="float64"),
+        "bytes1": np.array([b"\x00\x01\x02\x03\x04", b""], dtype="object"),
+        "string1": np.array(["hi", "bye"], dtype="object"),
+    }
+    expected = pd.DataFrame.from_dict(data)
+
+    # to_orc() returns raw bytes (not a buffer object) when no path is given.
+    orc_bytes = expected.to_orc()
+    got = read_orc(BytesIO(orc_bytes))
+
+    tm.assert_equal(expected, got)