pandas-dev · jreback · Jul 3, 2019 · Feb 27, 2019 · Feb 28, 2019 · Feb 28, 2019
diff --git a/ci/deps/travis-36-cov.yaml b/ci/deps/travis-36-cov.yaml
@@ -16,6 +16,7 @@ dependencies:
  - nomkl
  - numexpr
  - numpy=1.15.*
+ - odfpy
  - openpyxl
  - pandas-gbq
  # https://github.com/pydata/pandas-gbq/issues/271

diff --git a/doc/source/user_guide/io.rst b/doc/source/user_guide/io.rst
@@ -32,6 +32,7 @@ The pandas I/O API is a set of top level ``reader`` functions accessed like
  text;`HTML <https://en.wikipedia.org/wiki/HTML>`__;:ref:`read_html<io.read_html>`;:ref:`to_html<io.html>`
  text; Local clipboard;:ref:`read_clipboard<io.clipboard>`;:ref:`to_clipboard<io.clipboard>`
  binary;`MS Excel <https://en.wikipedia.org/wiki/Microsoft_Excel>`__;:ref:`read_excel<io.excel_reader>`;:ref:`to_excel<io.excel_writer>`
+ binary;`OpenDocument <http://www.opendocumentformat.org>`__;:ref:`read_excel<io.ods>`;
  binary;`HDF5 Format <https://support.hdfgroup.org/HDF5/whatishdf5.html>`__;:ref:`read_hdf<io.hdf5>`;:ref:`to_hdf<io.hdf5>`
  binary;`Feather Format <https://github.com/wesm/feather>`__;:ref:`read_feather<io.feather>`;:ref:`to_feather<io.feather>`
  binary;`Parquet Format <https://parquet.apache.org/>`__;:ref:`read_parquet<io.parquet>`;:ref:`to_parquet<io.parquet>`
@@ -2779,9 +2780,10 @@ parse HTML tables in the top-level pandas io function ``read_html``.
 Excel files
 -----------
 
-The :func:`~pandas.read_excel` method can read Excel 2003 (``.xls``) and
-Excel 2007+ (``.xlsx``) files using the ``xlrd`` Python
-module. The :meth:`~DataFrame.to_excel` instance method is used for
+The :func:`~pandas.read_excel` method can read Excel 2003 (``.xls``)
+files using the ``xlrd`` Python module. Excel 2007+ (``.xlsx``) files
+can be read using either ``xlrd`` or ``openpyxl``.
+The :meth:`~DataFrame.to_excel` instance method is used for
 saving a ``DataFrame`` to Excel. Generally the semantics are
 similar to working with :ref:`csv<io.read_csv_table>` data.
 See the :ref:`cookbook<cookbook.excel>` for some advanced strategies.
@@ -3217,7 +3219,20 @@ The look and feel of Excel worksheets created from pandas can be modified using
 * ``float_format`` : Format string for floating point numbers (default ``None``).
 * ``freeze_panes`` : A tuple of two integers representing the bottommost row and rightmost column to freeze. Each of these parameters is one-based, so (1, 1) will freeze the first row and first column (default ``None``).
 
+.. _io.ods:
 
+OpenDocument Spreadsheets
+-------------------------
+
+The :func:`~pandas.read_excel` method can also read OpenDocument spreadsheets
+using the ``odfpy`` module. Pass ``engine='odf'`` to select this reader; the
+semantics and features for reading OpenDocument spreadsheets then match what
+can be done for `Excel files`_.
+
+.. note::
+
+ Currently pandas only supports *reading* OpenDocument spreadsheets. Writing
+ is not implemented.
 
 .. _io.clipboard:
 

diff --git a/doc/source/whatsnew/v0.25.0.rst b/doc/source/whatsnew/v0.25.0.rst
@@ -164,6 +164,7 @@ Other enhancements
 - Added new option ``plotting.backend`` to be able to select a plotting backend different than the existing ``matplotlib`` one. Use ``pandas.set_option('plotting.backend', '<backend-module>')`` where ``<backend-module`` is a library implementing the pandas plotting API (:issue:`14130`)
 - :class:`pandas.offsets.BusinessHour` supports multiple opening hours intervals (:issue:`15481`)
 - :func:`read_excel` can now use ``openpyxl`` to read Excel files via the ``engine='openpyxl'`` argument. This will become the default in a future release (:issue:`11499`)
+- :func:`read_excel` now supports reading OpenDocument tables. Specify ``engine='odf'`` to enable (:issue:`9070`)
 
 .. _whatsnew_0250.api_breaking:
 

diff --git a/pandas/compat/_optional.py b/pandas/compat/_optional.py
@@ -13,6 +13,7 @@
  "lxml.etree": "3.8.0",
  "matplotlib": "2.2.2",
  "numexpr": "2.6.2",
+ "odfpy": "1.3.0",
  "openpyxl": "2.4.8",
  "pandas_gbq": "0.8.0",
  "pyarrow": "0.9.0",

diff --git a/pandas/core/config_init.py b/pandas/core/config_init.py
@@ -422,6 +422,7 @@ def use_inf_as_na_cb(key):
 _xls_options = ['xlrd']
 _xlsm_options = ['xlrd', 'openpyxl']
 _xlsx_options = ['xlrd', 'openpyxl']
+_ods_options = ['odf']
 
 
 with cf.config_prefix("io.excel.xls"):
@@ -447,6 +448,14 @@ def use_inf_as_na_cb(key):
  validator=str)
 
 
+# Register the ``io.excel.ods.reader`` option (default "auto"); its
+# description is built from ``reader_engine_doc`` and lists the engines
+# named in ``_ods_options``.
+with cf.config_prefix("io.excel.ods"):
+ cf.register_option("reader", "auto",
+ reader_engine_doc.format(
+ ext='ods',
+ others=', '.join(_ods_options)),
+ validator=str)
+
+
 # Set up the io.excel specific writer configuration.
 writer_engine_doc = """
 : string

diff --git a/pandas/io/excel/_base.py b/pandas/io/excel/_base.py
@@ -768,12 +768,14 @@ class ExcelFile:
  Acceptable values are None or ``xlrd``.
  """
 
- from pandas.io.excel._xlrd import _XlrdReader
+ from pandas.io.excel._odfreader import _ODFReader
  from pandas.io.excel._openpyxl import _OpenpyxlReader
+ from pandas.io.excel._xlrd import _XlrdReader
 
  _engines = {
  'xlrd': _XlrdReader,
  'openpyxl': _OpenpyxlReader,
+ 'odf': _ODFReader,
  }
 
  def __init__(self, io, engine=None):

diff --git a/pandas/io/excel/_odfreader.py b/pandas/io/excel/_odfreader.py
@@ -0,0 +1,176 @@
+from typing import List
+
+from pandas.compat._optional import import_optional_dependency
+
+import pandas as pd
+from pandas._typing import FilePathOrBuffer, Scalar
+
+from pandas.io.excel._base import _BaseExcelReader
+
+
+class _ODFReader(_BaseExcelReader):
+    """
+    Read tables out of OpenDocument formatted files.
+
+    Parameters
+    ----------
+    filepath_or_buffer : str or file-like
+        Path to be parsed or an open readable stream.
+    """
+
+    def __init__(self, filepath_or_buffer: FilePathOrBuffer):
+        # odfpy is an optional dependency; its import name is ``odf``.
+        import_optional_dependency("odf")
+        super().__init__(filepath_or_buffer)
+
+    @property
+    def _workbook_class(self):
+        """The odfpy class that represents an already loaded document."""
+        from odf.opendocument import OpenDocument
+        return OpenDocument
+
+    def load_workbook(self, filepath_or_buffer: FilePathOrBuffer):
+        """Load the document with ``odf.opendocument.load`` and return it."""
+        from odf.opendocument import load
+        return load(filepath_or_buffer)
+
+    @property
+    def empty_value(self) -> str:
+        """Property for compat with other readers."""
+        return ''
+
+    @property
+    def sheet_names(self) -> List[str]:
+        """Return a list of sheet names present in the document."""
+        from odf.table import Table
+
+        tables = self.book.getElementsByType(Table)
+        return [t.getAttribute("name") for t in tables]
+
+    def get_sheet_by_index(self, index: int):
+        """Return the table found at position ``index`` in the document."""
+        from odf.table import Table
+
+        tables = self.book.getElementsByType(Table)
+        return tables[index]
+
+    def get_sheet_by_name(self, name: str):
+        """
+        Return the first table whose ``name`` attribute equals ``name``.
+
+        Raises
+        ------
+        ValueError
+            If no table in the document has that name.
+        """
+        from odf.table import Table
+
+        tables = self.book.getElementsByType(Table)
+
+        for table in tables:
+            if table.getAttribute("name") == name:
+                return table
+
+        # The placeholder is a named field, so it has to be filled by
+        # keyword; a positional ``format(name)`` raises KeyError instead of
+        # the intended ValueError.
+        raise ValueError("sheet {name} not found".format(name=name))
+
+    def get_sheet_data(self, sheet, convert_float: bool) -> List[List[Scalar]]:
+        """
+        Parse an ODF Table into a list of lists.
+
+        Parameters
+        ----------
+        sheet : odf table element
+            The table to read, as returned by ``get_sheet_by_index`` or
+            ``get_sheet_by_name``.
+        convert_float : bool
+            Passed on to ``_get_cell_value``; integral floats are returned
+            as int when True.
+
+        Returns
+        -------
+        list of list of scalar
+            One list per row. Trailing empty cells of a row and trailing
+            empty rows of the table are dropped; the remaining rows are
+            padded with ``empty_value`` to a common length. Every row is a
+            distinct list object.
+        """
+        from odf.table import CoveredTableCell, TableCell, TableRow
+
+        covered_cell_name = CoveredTableCell().qname
+        table_cell_name = TableCell().qname
+        cell_names = {covered_cell_name, table_cell_name}
+
+        sheet_rows = sheet.getElementsByType(TableRow)
+        empty_rows = 0
+        max_row_len = 0
+
+        table = []  # type: List[List[Scalar]]
+
+        for sheet_row in sheet_rows:
+            sheet_cells = [x for x in sheet_row.childNodes
+                           if x.qname in cell_names]
+            empty_cells = 0
+            table_row = []  # type: List[Scalar]
+
+            for sheet_cell in sheet_cells:
+                # Covered cells (hidden by a merged cell) carry no value.
+                if sheet_cell.qname == table_cell_name:
+                    value = self._get_cell_value(sheet_cell, convert_float)
+                else:
+                    value = self.empty_value
+
+                column_repeat = self._get_column_repeat(sheet_cell)
+
+                # Queue up empty values, writing only if content succeeds them
+                if value == self.empty_value:
+                    empty_cells += column_repeat
+                else:
+                    table_row.extend([self.empty_value] * empty_cells)
+                    empty_cells = 0
+                    table_row.extend([value] * column_repeat)
+
+            max_row_len = max(max_row_len, len(table_row))
+
+            row_repeat = self._get_row_repeat(sheet_row)
+            if self._is_empty_row(sheet_row):
+                # Queue up empty rows, writing only if content succeeds them
+                empty_rows += row_repeat
+            else:
+                # Add the queued blank rows, then the (repeated) row itself.
+                # Each row gets its own list: ``[[x]] * n`` or appending the
+                # same ``table_row`` repeatedly would alias one list, so an
+                # in-place edit of one row would silently change the others.
+                table.extend([self.empty_value] for _ in range(empty_rows))
+                empty_rows = 0
+                table.extend(list(table_row) for _ in range(row_repeat))
+
+        # Make our table square
+        for row in table:
+            if len(row) < max_row_len:
+                row.extend([self.empty_value] * (max_row_len - len(row)))
+
+        return table
+
+    def _get_row_repeat(self, row) -> int:
+        """
+        Return number of times this row was repeated.
+
+        Repeating an empty row appeared to be a common way
+        of representing sparse rows in the table.
+        """
+        from odf.namespaces import TABLENS
+
+        return int(row.attributes.get((TABLENS, 'number-rows-repeated'), 1))
+
+    def _get_column_repeat(self, cell) -> int:
+        """Return number of times this cell was repeated (1 if unset)."""
+        from odf.namespaces import TABLENS
+
+        return int(cell.attributes.get(
+            (TABLENS, 'number-columns-repeated'), 1))
+
+    def _is_empty_row(self, row) -> bool:
+        """Return True if no child of ``row`` has child nodes of its own."""
+        return all(len(column.childNodes) == 0
+                   for column in row.childNodes)
+
+    def _get_cell_value(self, cell, convert_float: bool) -> Scalar:
+        """
+        Convert an ODF table cell into a Python scalar.
+
+        The conversion is chosen by the cell's ``office:value-type``
+        attribute: booleans become bool, floats become float (or int when
+        ``convert_float`` is True and the value is integral), percentages
+        and currencies become float, strings become str, dates become
+        timestamps and times become ``datetime.time``. A cell without a
+        value type yields ``empty_value``.
+
+        Raises
+        ------
+        ValueError
+            If the value type is not one of the types listed above.
+        """
+        from odf.namespaces import OFFICENS
+
+        cell_type = cell.attributes.get((OFFICENS, 'value-type'))
+        if cell_type == 'boolean':
+            return str(cell) == "TRUE"
+        if cell_type is None:
+            return self.empty_value
+        elif cell_type == 'float':
+            # GH5394
+            cell_value = float(cell.attributes.get((OFFICENS, 'value')))
+
+            # NA handling: a zero-valued cell is returned as its displayed
+            # text. An earlier extra test, ``str(cell) != cell_value``,
+            # compared a str with a float and was therefore always True, so
+            # leaving it out does not change the result.
+            if cell_value == 0.:
+                return str(cell)
+
+            if convert_float:
+                val = int(cell_value)
+                if val == cell_value:
+                    return val
+            return cell_value
+        elif cell_type in ('percentage', 'currency'):
+            cell_value = cell.attributes.get((OFFICENS, 'value'))
+            return float(cell_value)
+        elif cell_type == 'string':
+            return str(cell)
+        elif cell_type == 'date':
+            cell_value = cell.attributes.get((OFFICENS, 'date-value'))
+            return pd.to_datetime(cell_value)
+        elif cell_type == 'time':
+            return pd.to_datetime(str(cell)).time()
+        else:
+            raise ValueError('Unrecognized type {}'.format(cell_type))
diff --git a/pandas/tests/io/data/blank.ods b/pandas/tests/io/data/blank.ods
diff --git a/pandas/tests/io/data/blank_with_header.ods b/pandas/tests/io/data/blank_with_header.ods
diff --git a/pandas/tests/io/data/invalid_value_type.ods b/pandas/tests/io/data/invalid_value_type.ods
diff --git a/pandas/tests/io/data/test1.ods b/pandas/tests/io/data/test1.ods
diff --git a/pandas/tests/io/data/test2.ods b/pandas/tests/io/data/test2.ods
diff --git a/pandas/tests/io/data/test3.ods b/pandas/tests/io/data/test3.ods
diff --git a/pandas/tests/io/data/test4.ods b/pandas/tests/io/data/test4.ods
diff --git a/pandas/tests/io/data/test5.ods b/pandas/tests/io/data/test5.ods
diff --git a/pandas/tests/io/data/test_converters.ods b/pandas/tests/io/data/test_converters.ods
diff --git a/pandas/tests/io/data/test_index_name_pre17.ods b/pandas/tests/io/data/test_index_name_pre17.ods
diff --git a/pandas/tests/io/data/test_multisheet.ods b/pandas/tests/io/data/test_multisheet.ods
diff --git a/pandas/tests/io/data/test_squeeze.ods b/pandas/tests/io/data/test_squeeze.ods
diff --git a/pandas/tests/io/data/test_types.ods b/pandas/tests/io/data/test_types.ods
diff --git a/pandas/tests/io/data/testdateoverflow.ods b/pandas/tests/io/data/testdateoverflow.ods
diff --git a/pandas/tests/io/data/testdtype.ods b/pandas/tests/io/data/testdtype.ods
diff --git a/pandas/tests/io/data/testmultiindex.ods b/pandas/tests/io/data/testmultiindex.ods
diff --git a/pandas/tests/io/data/testskiprows.ods b/pandas/tests/io/data/testskiprows.ods
diff --git a/pandas/tests/io/data/times_1900.ods b/pandas/tests/io/data/times_1900.ods
diff --git a/pandas/tests/io/data/times_1904.ods b/pandas/tests/io/data/times_1904.ods
diff --git a/pandas/tests/io/data/writertable.odt b/pandas/tests/io/data/writertable.odt
diff --git a/pandas/tests/io/excel/conftest.py b/pandas/tests/io/excel/conftest.py
@@ -30,7 +30,7 @@ def df_ref():
  return df_ref
 
 
-@pytest.fixture(params=['.xls', '.xlsx', '.xlsm'])
+@pytest.fixture(params=['.xls', '.xlsx', '.xlsm', '.ods'])
 def read_ext(request):
  """
  Valid extensions for reading Excel files.

diff --git a/pandas/tests/io/excel/test_odf.py b/pandas/tests/io/excel/test_odf.py
@@ -0,0 +1,39 @@
+import functools
+
+import numpy as np
+import pytest
+
+import pandas as pd
+import pandas.util.testing as tm
+
+pytest.importorskip("odf")
+
+
+@pytest.fixture(autouse=True)
+def cd_and_set_engine(monkeypatch, datapath):
+ # Applied automatically to every test in this module: make pd.read_excel
+ # default to engine="odf" and run from the shared io/data directory so
+ # the tests can refer to fixture files by bare file name.
+ func = functools.partial(pd.read_excel, engine="odf")
+ monkeypatch.setattr(pd, 'read_excel', func)
+ monkeypatch.chdir(datapath("io", "data"))
+
+
+def test_read_invalid_types_raises():
+ # invalid_value_type.ods was produced by manually editing the
+ # content.xml file included in the document
+ with pytest.raises(ValueError,
+ match="Unrecognized type awesome_new_type"):
+ pd.read_excel("invalid_value_type.ods")
+
+
+def test_read_writer_table():
+ # Also test reading tables from a text OpenDocument file
+ # (.odt)
+ index = pd.Index(["Row 1", "Row 2", "Row 3"], name="Header")
+ expected = pd.DataFrame([
+ [1, np.nan, 7],
+ [2, np.nan, 8],
+ [3, np.nan, 9],
+ ], index=index, columns=["Column 1", "Unnamed: 2", "Column 3"])
+
+ result = pd.read_excel("writertable.odt", 'Table1', index_col=0)
+
+ tm.assert_frame_equal(result, expected)