3131 _NA_VALUES , _is_url , _stringify_path , _urlopen , _validate_header_arg ,
3232 get_filepath_or_buffer )
3333from pandas .io .formats .printing import pprint_thing
34- from pandas .io .parsers import TextParser
34+ from pandas .io .parsers import TextParser , _validate_usecols_names , _validate_usecols_arg
3535
3636__all__ = ["read_excel" , "ExcelWriter" , "ExcelFile" ]
3737
@@ -449,7 +449,7 @@ def parse(self,
449449 data = self .get_sheet_data (sheet , convert_float )
450450 usecols = _maybe_convert_usecols (usecols )
451451
452- if sheet . nrows == 0 :
452+ if not data :
453453 output [asheetname ] = DataFrame ()
454454 continue
455455
@@ -651,6 +651,209 @@ def _parse_cell(cell_contents, cell_typ):
651651 return data
652652
653653
class _OpenpyxlReader(_BaseExcelReader):
    """Excel reader that loads workbooks through the openpyxl engine."""

    def __init__(self, filepath_or_buffer):
        """Reader using openpyxl engine.

        Parameters
        ----------
        filepath_or_buffer : string, path object or Workbook
            Object to be parsed.

        Raises
        ------
        ImportError
            If openpyxl cannot be imported.
        ValueError
            If the input is neither a Workbook, an object with a ``read``
            method, nor a string path.
        """
        err_msg = "Install openpyxl for Excel support"

        try:
            import openpyxl
        except ImportError:
            raise ImportError(err_msg)

        # If filepath_or_buffer is a url, want to keep the data as bytes so
        # can't pass to get_filepath_or_buffer()
        if _is_url(filepath_or_buffer):
            filepath_or_buffer = _urlopen(filepath_or_buffer)
        elif not isinstance(filepath_or_buffer,
                            (ExcelFile, openpyxl.Workbook)):
            filepath_or_buffer, _, _, _ = get_filepath_or_buffer(
                filepath_or_buffer)

        if isinstance(filepath_or_buffer, openpyxl.Workbook):
            self.book = filepath_or_buffer
        elif hasattr(filepath_or_buffer, "read"):
            if hasattr(filepath_or_buffer, 'seek'):
                try:
                    # GH 19779
                    filepath_or_buffer.seek(0)
                except UnsupportedOperation:
                    # HTTPResponse does not support seek()
                    # GH 20434
                    pass

            # Hand openpyxl an in-memory copy of the bytes: once read(),
            # the original stream is exhausted and may not be seekable.
            from io import BytesIO

            data = filepath_or_buffer.read()
            self.book = openpyxl.load_workbook(
                BytesIO(data), data_only=True)
        elif isinstance(filepath_or_buffer, compat.string_types):
            self.book = openpyxl.load_workbook(
                filepath_or_buffer, data_only=True)
        else:
            raise ValueError('Must explicitly set engine if not passing in'
                             ' buffer or path for io.')

    @property
    def sheet_names(self):
        """Sheet names of the loaded workbook (``book.sheetnames``)."""
        return self.book.sheetnames

    def get_sheet_by_name(self, name):
        """Return the worksheet looked up as ``self.book[name]``."""
        return self.book[name]

    def get_sheet_by_index(self, index):
        """Return the worksheet at position ``index`` in the workbook."""
        return self.book.worksheets[index]

    @staticmethod
    def _replace_type_error_with_nan(rows):
        """Yield each row as a list of cell values.

        Cells whose ``data_type`` equals ``TYPE_ERROR`` are replaced by
        NaN; every other cell contributes its ``value``.
        """
        nan = float('nan')
        for row in rows:
            yield [nan if cell.data_type == cell.TYPE_ERROR else cell.value
                   for cell in row]

    def get_sheet_data(self, sheet, convert_float):
        """Return the rows of ``sheet`` as a list of lists of values.

        ``convert_float`` is accepted but not used by this implementation.
        """
        data = self._replace_type_error_with_nan(sheet.rows)
        # TODO: support using iterator
        # TODO: don't make strings out of data
        return list(data)

    def parse(self,
              sheet_name=0,
              header=0,
              names=None,
              index_col=None,
              usecols=None,
              squeeze=False,
              dtype=None,
              true_values=None,
              false_values=None,
              skiprows=None,
              nrows=None,
              na_values=None,
              verbose=False,
              parse_dates=False,
              date_parser=None,
              thousands=None,
              comment=None,
              skipfooter=0,
              convert_float=True,
              mangle_dupe_cols=True,
              **kwds):
        """Parse one or more sheets into DataFrame objects.

        The first row left after applying ``skiprows`` is always used as
        the column labels. ``names``, ``dtype``, ``true_values``,
        ``false_values``, ``nrows``, ``na_values``, ``parse_dates``,
        ``date_parser``, ``thousands``, ``comment``, ``skipfooter``,
        ``mangle_dupe_cols`` and ``**kwds`` are accepted but not read by
        this implementation.

        Parameters
        ----------
        sheet_name : string, int, list or None, default 0
            A string selects a sheet by name, any other scalar is used as
            a positional index. A list selects several sheets and None
            selects all of them.
        header : int, list-like or None, default 0
            Validated with ``_validate_header_arg``. A list-like with more
            than one entry has its rows forward filled for MultiIndex
            columns.
        index_col : int, string, list-like or None, default None
            Column(s) to set as the index, by label or by position.
        usecols : optional
            Column selection, by label or by position.
        skiprows : int, list-like or None, default None
            An integer drops that many leading rows; a list-like drops the
            rows at those 0-based positions.
        verbose : bool, default False
            Print the name of each sheet as it is read.

        Returns
        -------
        DataFrame or OrderedDict of DataFrame
            An OrderedDict keyed by the requested sheets when
            ``sheet_name`` is a list or None, otherwise a single frame.
        """
        _validate_header_arg(header)

        ret_dict = False

        # Keep sheetname to maintain backwards compatibility.
        if isinstance(sheet_name, list):
            sheets = sheet_name
            ret_dict = True
        elif sheet_name is None:
            sheets = self.sheet_names
            ret_dict = True
        else:
            sheets = [sheet_name]

        # handle same-type duplicates.
        sheets = list(OrderedDict.fromkeys(sheets).keys())

        output = OrderedDict()

        for asheetname in sheets:
            if verbose:
                print("Reading sheet {sheet}".format(sheet=asheetname))

            if isinstance(asheetname, compat.string_types):
                sheet = self.get_sheet_by_name(asheetname)
            else:  # assume an integer if not a string
                sheet = self.get_sheet_by_index(asheetname)

            data = self.get_sheet_data(sheet, convert_float)
            usecols = _maybe_convert_usecols(usecols)

            if not data:
                output[asheetname] = DataFrame()
                continue

            if is_list_like(header) and len(header) == 1:
                header = header[0]

            # forward fill and pull out names for MultiIndex column
            header_names = None
            if header is not None and is_list_like(header):
                header_names = []
                control_row = [True] * len(data[0])

                for row in header:
                    if is_integer(skiprows):
                        row += skiprows

                    data[row], control_row = _fill_mi_header(data[row],
                                                             control_row)

                    if index_col is not None:
                        header_name, _ = _pop_header_name(data[row],
                                                          index_col)
                        header_names.append(header_name)

            # An integer means "drop that many leading rows"; a list-like
            # names the individual 0-based rows to drop.
            if is_integer(skiprows):
                data = data[skiprows:]
            elif skiprows:
                skipped = set(skiprows)
                data = [row for i, row in enumerate(data)
                        if i not in skipped]

            # Skipping may have removed every row, header row included.
            if not data:
                output[asheetname] = DataFrame()
                continue

            column_names = list(data.pop(0))

            frame = DataFrame(data, columns=column_names)
            if usecols:
                _validate_usecols_arg(usecols)
                usecols = sorted(usecols)
                if any(isinstance(i, compat.string_types) for i in usecols):
                    _validate_usecols_names(usecols, column_names)
                    frame = frame[usecols]
                else:
                    frame = frame.iloc[:, usecols]

            if index_col is not None:
                if is_list_like(index_col):
                    if any(isinstance(i, compat.string_types)
                           for i in index_col):
                        frame = frame.set_index(index_col)
                        if len(index_col) == 1:
                            # TODO: understand why this is needed
                            # NOTE(review): the message looks like a regex
                            # pattern; confirm what depends on this error.
                            raise TypeError(
                                "list indices must be integers.*, not str")
                    else:
                        frame = frame.set_index(
                            [column_names[i] for i in index_col])
                else:
                    if isinstance(index_col, compat.string_types):
                        frame = frame.set_index(index_col)
                    else:
                        frame = frame.set_index(column_names[index_col])

            output[asheetname] = frame
            if not squeeze or isinstance(output[asheetname], DataFrame):
                if header_names:
                    output[asheetname].columns = output[
                        asheetname].columns.set_names(header_names)
                elif compat.PY2:
                    output[asheetname].columns = _maybe_convert_to_string(
                        output[asheetname].columns)

            # name unnamed columns; assign a fresh set of labels instead
            # of writing into the existing Index's backing array
            columns = frame.columns
            if any(col_name is None for col_name in columns):
                unnamed = 0
                renamed = []
                for col_name in columns:
                    if col_name is None:
                        col_name = "Unnamed: {n}".format(n=unnamed)
                        unnamed += 1
                    renamed.append(col_name)
                level_names = list(columns.names)
                frame.columns = renamed
                frame.columns.names = level_names

        if ret_dict:
            return output
        else:
            return output[asheetname]
856+
654857class ExcelFile (object ):
655858 """
656859 Class for parsing tabular excel sheets into DataFrame objects.
@@ -668,6 +871,7 @@ class ExcelFile(object):
668871
669872 _engines = {
670873 'xlrd' : _XlrdReader ,
874+ 'openpyxl' : _OpenpyxlReader ,
671875 }
672876
673877 def __init__ (self , io , engine = None ):
0 commit comments