Skip to content

Commit ce4eb01

Browse files
committed
implement first version of openpyxl reader
1 parent e0199a8 commit ce4eb01

File tree

1 file changed

+206
-2
lines changed

1 file changed

+206
-2
lines changed

pandas/io/excel.py

Lines changed: 206 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -31,7 +31,7 @@
3131
_NA_VALUES, _is_url, _stringify_path, _urlopen, _validate_header_arg,
3232
get_filepath_or_buffer)
3333
from pandas.io.formats.printing import pprint_thing
34-
from pandas.io.parsers import TextParser
34+
from pandas.io.parsers import TextParser, _validate_usecols_names, _validate_usecols_arg
3535

3636
__all__ = ["read_excel", "ExcelWriter", "ExcelFile"]
3737

@@ -449,7 +449,7 @@ def parse(self,
449449
data = self.get_sheet_data(sheet, convert_float)
450450
usecols = _maybe_convert_usecols(usecols)
451451

452-
if sheet.nrows == 0:
452+
if not data:
453453
output[asheetname] = DataFrame()
454454
continue
455455

@@ -651,6 +651,209 @@ def _parse_cell(cell_contents, cell_typ):
651651
return data
652652

653653

654+
class _OpenpyxlReader(_BaseExcelReader):
    """Excel reader that loads workbooks through the ``openpyxl`` engine.

    The workbook is opened with ``data_only=True`` and every sheet is
    materialised as a list of rows before being turned into a DataFrame.
    """

    def __init__(self, filepath_or_buffer):
        """Reader using openpyxl engine.

        Parameters
        ----------
        filepath_or_buffer : string, path object or Workbook
            Object to be parsed.

        Raises
        ------
        ImportError
            If openpyxl cannot be imported.
        ValueError
            If the input is not a Workbook, an object with ``read`` or a
            string path.
        """
        err_msg = "Install openpyxl for Excel support"

        try:
            import openpyxl
        except ImportError:
            raise ImportError(err_msg)

        # Imported locally so this block does not depend on the module's
        # import section providing BytesIO.
        from io import BytesIO

        # If filepath_or_buffer is a url, want to keep the data as bytes so
        # can't pass to get_filepath_or_buffer()
        if _is_url(filepath_or_buffer):
            filepath_or_buffer = _urlopen(filepath_or_buffer)
        elif not isinstance(filepath_or_buffer,
                            (ExcelFile, openpyxl.Workbook)):
            filepath_or_buffer, _, _, _ = get_filepath_or_buffer(
                filepath_or_buffer)

        if isinstance(filepath_or_buffer, openpyxl.Workbook):
            self.book = filepath_or_buffer
        elif hasattr(filepath_or_buffer, "read"):
            if hasattr(filepath_or_buffer, 'seek'):
                try:
                    # GH 19779
                    filepath_or_buffer.seek(0)
                except UnsupportedOperation:
                    # HTTPResponse does not support seek()
                    # GH 20434
                    pass

            # The stream is exhausted by read(); hand openpyxl a fresh
            # in-memory, seekable buffer instead of the consumed stream.
            content = filepath_or_buffer.read()
            self.book = openpyxl.load_workbook(
                BytesIO(content), data_only=True)
        elif isinstance(filepath_or_buffer, compat.string_types):
            self.book = openpyxl.load_workbook(
                filepath_or_buffer, data_only=True)
        else:
            raise ValueError('Must explicitly set engine if not passing in'
                             ' buffer or path for io.')

    @property
    def sheet_names(self):
        """Sheet names as reported by the workbook's ``sheetnames``."""
        return self.book.sheetnames

    def get_sheet_by_name(self, name):
        """Return the worksheet stored in the workbook under ``name``."""
        return self.book[name]

    def get_sheet_by_index(self, index):
        """Return the worksheet at position ``index`` in the workbook."""
        return self.book.worksheets[index]

    @staticmethod
    def _replace_type_error_with_nan(rows):
        """Yield one list of cell values per row, NaN for error cells.

        A cell counts as an error cell when its ``data_type`` equals its
        ``TYPE_ERROR`` attribute.
        """
        nan = float('nan')
        for row in rows:
            yield [nan if cell.data_type == cell.TYPE_ERROR else cell.value
                   for cell in row]

    def get_sheet_data(self, sheet, convert_float):
        """Return the sheet contents as a list of row lists.

        ``convert_float`` is accepted for interface compatibility but is
        not used here.
        """
        data = self._replace_type_error_with_nan(sheet.rows)
        # TODO: support using iterator
        # TODO: don't make strings out of data
        return list(data)

    @staticmethod
    def _drop_skipped_rows(data, skiprows):
        """Return ``data`` without the rows selected by ``skiprows``.

        An integer drops that many leading rows, a callable is evaluated
        against each row position (truthy means skip), and anything else
        is treated as a collection of row positions to drop.
        """
        if is_integer(skiprows):
            return data[skiprows:]
        if callable(skiprows):
            return [row for i, row in enumerate(data) if not skiprows(i)]
        skipped = set(skiprows)
        return [row for i, row in enumerate(data) if i not in skipped]

    @staticmethod
    def _name_unnamed_columns(frame):
        """Replace ``None`` column labels by ``"Unnamed: <n>"`` in place.

        ``n`` counts the unnamed columns from zero in column order. The
        column index is rebuilt rather than mutated through ``.values``,
        and its level names are carried over.
        """
        labels = list(frame.columns)
        if not any(label is None for label in labels):
            return

        unnamed = 0
        for i, label in enumerate(labels):
            if label is None:
                labels[i] = "Unnamed: {n}".format(n=unnamed)
                unnamed += 1

        level_names = list(frame.columns.names)
        frame.columns = labels
        frame.columns.names = level_names

    def parse(self,
              sheet_name=0,
              header=0,
              names=None,
              index_col=None,
              usecols=None,
              squeeze=False,
              dtype=None,
              true_values=None,
              false_values=None,
              skiprows=None,
              nrows=None,
              na_values=None,
              verbose=False,
              parse_dates=False,
              date_parser=None,
              thousands=None,
              comment=None,
              skipfooter=0,
              convert_float=True,
              mangle_dupe_cols=True,
              **kwds):
        """Parse one or more sheets into DataFrame objects.

        Parameters
        ----------
        sheet_name : string, int, list or None, default 0
            Sheet(s) to read. Strings are looked up by name, anything else
            by position. A list or None (all sheets) makes the method
            return a dict.
        header : int or list of ints, default 0
            Validated with ``_validate_header_arg``. A list with more than
            one entry triggers forward filling of those rows and
            collection of header names.
        index_col : int, string, list or None, default None
            Column(s) to use as index, by position or by label.
        usecols : optional
            Column selection, normalised with ``_maybe_convert_usecols``;
            strings select by label, otherwise by position.
        skiprows : int, list-like, callable or None, default None
            Rows to drop before the column names are taken.
        squeeze : bool, default False
            Only consulted when deciding whether to post-process columns.
        verbose : bool, default False
            Print the name of each sheet as it is read.
        convert_float : bool, default True
            Forwarded to ``get_sheet_data``.

        Returns
        -------
        DataFrame or OrderedDict of DataFrames
            A dict keyed by the requested sheet identifiers when
            ``sheet_name`` is a list or None, otherwise a single frame.

        Notes
        -----
        NOTE(review): in this first version the remaining keyword
        arguments (``names``, ``dtype``, ``true_values``, ``false_values``,
        ``nrows``, ``na_values``, ``parse_dates``, ``date_parser``,
        ``thousands``, ``comment``, ``skipfooter``, ``mangle_dupe_cols``
        and ``**kwds``) are accepted but not used, and the first row left
        after ``skiprows`` is always taken as the column names regardless
        of ``header``.
        """
        _validate_header_arg(header)

        ret_dict = False

        # Keep sheetname to maintain backwards compatibility.
        if isinstance(sheet_name, list):
            sheets = sheet_name
            ret_dict = True
        elif sheet_name is None:
            sheets = self.sheet_names
            ret_dict = True
        else:
            sheets = [sheet_name]

        # handle same-type duplicates.
        sheets = list(OrderedDict.fromkeys(sheets).keys())

        output = OrderedDict()

        for asheetname in sheets:
            if verbose:
                print("Reading sheet {sheet}".format(sheet=asheetname))

            if isinstance(asheetname, compat.string_types):
                sheet = self.get_sheet_by_name(asheetname)
            else:  # assume an integer if not a string
                sheet = self.get_sheet_by_index(asheetname)

            data = self.get_sheet_data(sheet, convert_float)
            usecols = _maybe_convert_usecols(usecols)

            if not data:
                output[asheetname] = DataFrame()
                continue

            if is_list_like(header) and len(header) == 1:
                header = header[0]

            # forward fill and pull out names for MultiIndex column
            header_names = None
            if header is not None and is_list_like(header):
                header_names = []
                control_row = [True] * len(data[0])

                for row in header:
                    if is_integer(skiprows):
                        row += skiprows

                    data[row], control_row = _fill_mi_header(data[row],
                                                             control_row)

                    if index_col is not None:
                        header_name, _ = _pop_header_name(data[row],
                                                          index_col)
                        header_names.append(header_name)

            if skiprows:
                data = self._drop_skipped_rows(data, skiprows)
                if not data:
                    # Every row was skipped: treat like an empty sheet
                    # instead of failing on the header pop below.
                    output[asheetname] = DataFrame()
                    continue

            column_names = list(data.pop(0))

            frame = DataFrame(data, columns=column_names)
            if usecols:
                _validate_usecols_arg(usecols)
                usecols = sorted(usecols)
                if any(isinstance(i, str) for i in usecols):
                    _validate_usecols_names(usecols, column_names)
                    frame = frame[usecols]
                else:
                    frame = frame.iloc[:, usecols]

            if index_col is not None:
                if is_list_like(index_col):
                    if any(isinstance(i, str) for i in index_col):
                        frame = frame.set_index(index_col)
                        if len(index_col) == 1:
                            # TODO: understand why this is needed
                            raise TypeError(
                                "list indices must be integers.*, not str")
                    else:
                        frame = frame.set_index(
                            [column_names[i] for i in index_col])
                else:
                    if isinstance(index_col, str):
                        frame = frame.set_index(index_col)
                    else:
                        frame = frame.set_index(column_names[index_col])

            output[asheetname] = frame
            if not squeeze or isinstance(output[asheetname], DataFrame):
                if header_names:
                    output[asheetname].columns = output[
                        asheetname].columns.set_names(header_names)
                elif compat.PY2:
                    output[asheetname].columns = _maybe_convert_to_string(
                        output[asheetname].columns)

            # name unnamed columns
            self._name_unnamed_columns(frame)

        if ret_dict:
            return output
        else:
            return output[asheetname]
855+
856+
654857
class ExcelFile(object):
655858
"""
656859
Class for parsing tabular excel sheets into DataFrame objects.
@@ -668,6 +871,7 @@ class ExcelFile(object):
668871

669872
_engines = {
670873
'xlrd': _XlrdReader,
874+
'openpyxl': _OpenpyxlReader,
671875
}
672876

673877
def __init__(self, io, engine=None):

0 commit comments

Comments
 (0)