Skip to content

Commit 1be0561

Browse files
tdamsma authored and WillAyd committed
Openpyxl engine for reading excel files (#25092)
1 parent be4b48e commit 1be0561

File tree

8 files changed

+125
-4
lines changed

8 files changed

+125
-4
lines changed

doc/source/whatsnew/v0.25.0.rst

+1
Original file line numberDiff line numberDiff line change
@@ -159,6 +159,7 @@ Other enhancements
159159
- Added support for reading SPSS .sav files using :func:`read_spss` (:issue:`26537`)
160160
- Added new option ``plotting.backend`` to be able to select a plotting backend different than the existing ``matplotlib`` one. Use ``pandas.set_option('plotting.backend', '<backend-module>')`` where ``<backend-module>`` is a library implementing the pandas plotting API (:issue:`14130`)
161161
- :class:`pandas.offsets.BusinessHour` supports multiple opening hours intervals (:issue:`15481`)
162+
- :func:`read_excel` can now use ``openpyxl`` to read Excel files via the ``engine='openpyxl'`` argument. This will become the default in a future release (:issue:`11499`)
162163

163164
.. _whatsnew_0250.api_breaking:
164165

pandas/_typing.py

+1
Original file line numberDiff line numberDiff line change
@@ -24,3 +24,4 @@
2424
FilePathOrBuffer = Union[str, Path, IO[AnyStr]]
2525

2626
FrameOrSeries = TypeVar('FrameOrSeries', ABCSeries, ABCDataFrame)
27+
# Alias for a single scalar value; used as the cell-value type in the
# Excel reader annotations.
Scalar = Union[str, int, float]

pandas/core/config_init.py

+37-1
Original file line numberDiff line numberDiff line change
@@ -411,7 +411,43 @@ def use_inf_as_na_cb(key):
411411
cf.register_option('chained_assignment', 'warn', chained_assignment,
412412
validator=is_one_of_factory([None, 'warn', 'raise']))
413413

414-
# Set up the io.excel specific configuration.
414+
415+
# Set up the io.excel specific reader configuration.
416+
reader_engine_doc = """
417+
: string
418+
The default Excel reader engine for '{ext}' files. Available options:
419+
auto, {others}.
420+
"""
421+
422+
# Reader engines advertised in the option description of each extension.
_xls_options = ['xlrd']
_xlsm_options = ['xlrd', 'openpyxl']
_xlsx_options = ['xlrd', 'openpyxl']

# One ``io.excel.<ext>.reader`` option is registered per extension.  Each
# defaults to "auto" and gets its description from ``reader_engine_doc``.
# NOTE(review): ``validator=str`` does not appear to restrict the value to
# the engines listed in the description -- confirm this is intended.
with cf.config_prefix("io.excel.xls"):
    cf.register_option(
        "reader", "auto",
        doc=reader_engine_doc.format(
            ext='xls', others=', '.join(_xls_options)),
        validator=str)

with cf.config_prefix("io.excel.xlsm"):
    cf.register_option(
        "reader", "auto",
        doc=reader_engine_doc.format(
            ext='xlsm', others=', '.join(_xlsm_options)),
        validator=str)

with cf.config_prefix("io.excel.xlsx"):
    cf.register_option(
        "reader", "auto",
        doc=reader_engine_doc.format(
            ext='xlsx', others=', '.join(_xlsx_options)),
        validator=str)
448+
449+
450+
# Set up the io.excel specific writer configuration.
415451
writer_engine_doc = """
416452
: string
417453
The default Excel writer engine for '{ext}' files. Available options:

pandas/io/excel/_base.py

+3-1
Original file line numberDiff line numberDiff line change
@@ -422,7 +422,7 @@ def parse(self,
422422
data = self.get_sheet_data(sheet, convert_float)
423423
usecols = _maybe_convert_usecols(usecols)
424424

425-
if sheet.nrows == 0:
425+
if not data:
426426
output[asheetname] = DataFrame()
427427
continue
428428

@@ -769,9 +769,11 @@ class ExcelFile:
769769
"""
770770

771771
from pandas.io.excel._xlrd import _XlrdReader
772+
from pandas.io.excel._openpyxl import _OpenpyxlReader
772773

773774
_engines = {
774775
'xlrd': _XlrdReader,
776+
'openpyxl': _OpenpyxlReader,
775777
}
776778

777779
def __init__(self, io, engine=None):

pandas/io/excel/_openpyxl.py

+73-1
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,12 @@
1-
from pandas.io.excel._base import ExcelWriter
1+
from typing import List
2+
3+
import numpy as np
4+
5+
from pandas.compat._optional import import_optional_dependency
6+
7+
from pandas._typing import FilePathOrBuffer, Scalar
8+
9+
from pandas.io.excel._base import ExcelWriter, _BaseExcelReader
210
from pandas.io.excel._util import _validate_freeze_panes
311

412

@@ -451,3 +459,67 @@ def write_cells(self, cells, sheet_name=None, startrow=0, startcol=0,
451459
xcell = wks.cell(column=col, row=row)
452460
for k, v in style_kwargs.items():
453461
setattr(xcell, k, v)
462+
463+
464+
class _OpenpyxlReader(_BaseExcelReader):
    """Excel reader that delegates workbook access to ``openpyxl``."""

    def __init__(self, filepath_or_buffer: FilePathOrBuffer) -> None:
        """
        Create a reader that uses the openpyxl engine.

        Parameters
        ----------
        filepath_or_buffer : string, path object or Workbook
            Object to be parsed.
        """
        # The optional dependency is resolved before the base class is
        # initialised with the input; the returned module is not needed.
        import_optional_dependency("openpyxl")
        super().__init__(filepath_or_buffer)

    @property
    def _workbook_class(self):
        """Return the ``openpyxl.Workbook`` class, imported on access."""
        import openpyxl
        return openpyxl.Workbook

    def load_workbook(self, filepath_or_buffer: FilePathOrBuffer):
        """
        Open ``filepath_or_buffer`` with openpyxl.

        The workbook is always requested with ``read_only=True`` and
        ``data_only=True``.
        """
        import openpyxl
        options = {'read_only': True, 'data_only': True}
        return openpyxl.load_workbook(filepath_or_buffer, **options)

    @property
    def sheet_names(self) -> List[str]:
        """Sheet names as reported by the workbook's ``sheetnames``."""
        workbook = self.book
        return workbook.sheetnames

    def get_sheet_by_name(self, name: str):
        """Look up a worksheet in the workbook by its name."""
        workbook = self.book
        return workbook[name]

    def get_sheet_by_index(self, index: int):
        """Look up a worksheet by its position in ``book.worksheets``."""
        sheets = self.book.worksheets
        return sheets[index]

    def _convert_cell(self, cell, convert_float: bool) -> Scalar:
        """
        Translate one openpyxl cell into the value stored for it.

        Parameters
        ----------
        cell : openpyxl cell
            Cell to convert; its ``is_date``, ``data_type`` and ``value``
            attributes are read.
        convert_float : bool
            If True, numeric cells holding an integral value are returned
            as ``int``; if False, numeric cells are returned as ``float``.

        Returns
        -------
        scalar
            The converted cell value.
        """
        # TODO: replace with openpyxl constants
        # The order of the checks below matters: dates first, then the
        # error and boolean type codes, then empty cells, then numbers.
        if cell.is_date:
            return cell.value

        kind = cell.data_type
        if kind == 'e':
            return np.nan
        if kind == 'b':
            return bool(cell.value)
        if cell.value is None:
            return ''  # compat with xlrd
        if kind != 'n':
            return cell.value

        # GH5394
        if not convert_float:
            return float(cell.value)
        as_int = int(cell.value)
        # Only integral values are narrowed to int; others stay untouched.
        return as_int if as_int == cell.value else cell.value

    def get_sheet_data(self, sheet, convert_float: bool) -> List[List[Scalar]]:
        """
        Convert every cell of ``sheet`` row by row.

        Returns
        -------
        list of list of scalar
            One inner list per entry of ``sheet.rows``, holding the
            converted value of each cell in that row.
        """
        return [
            [self._convert_cell(cell, convert_float) for cell in row]
            for row in sheet.rows
        ]

pandas/tests/io/data/test1.xlsm

-1.83 KB
Binary file not shown.

pandas/tests/io/data/test1.xlsx

-1.76 KB
Binary file not shown.

pandas/tests/io/excel/test_readers.py

+10-1
Original file line numberDiff line numberDiff line change
@@ -38,13 +38,17 @@ class TestReaders:
3838
# Add any engines to test here
3939
pytest.param('xlrd', marks=pytest.mark.skipif(
4040
not td.safe_import("xlrd"), reason="no xlrd")),
41+
pytest.param('openpyxl', marks=pytest.mark.skipif(
42+
not td.safe_import("openpyxl"), reason="no openpyxl")),
4143
pytest.param(None, marks=pytest.mark.skipif(
4244
not td.safe_import("xlrd"), reason="no xlrd")),
4345
])
44-
def cd_and_set_engine(self, request, datapath, monkeypatch):
46+
def cd_and_set_engine(self, request, datapath, monkeypatch, read_ext):
4547
"""
4648
Change directory and set engine for read_excel calls.
4749
"""
50+
if request.param == 'openpyxl' and read_ext == '.xls':
51+
pytest.skip()
4852
func = partial(pd.read_excel, engine=request.param)
4953
monkeypatch.chdir(datapath("io", "data"))
5054
monkeypatch.setattr(pd, 'read_excel', func)
@@ -397,6 +401,9 @@ def test_date_conversion_overflow(self, read_ext):
397401
[1e+20, 'Timothy Brown']],
398402
columns=['DateColWithBigInt', 'StringCol'])
399403

404+
if pd.read_excel.keywords['engine'] == 'openpyxl':
405+
pytest.xfail("Maybe not supported by openpyxl")
406+
400407
result = pd.read_excel('testdateoverflow' + read_ext)
401408
tm.assert_frame_equal(result, expected)
402409

@@ -724,6 +731,8 @@ class TestExcelFileRead:
724731
# Add any engines to test here
725732
pytest.param('xlrd', marks=pytest.mark.skipif(
726733
not td.safe_import("xlrd"), reason="no xlrd")),
734+
pytest.param('openpyxl', marks=pytest.mark.skipif(
735+
not td.safe_import("openpyxl"), reason="no openpyxl")),
727736
pytest.param(None, marks=pytest.mark.skipif(
728737
not td.safe_import("xlrd"), reason="no xlrd")),
729738
])

0 commit comments

Comments
 (0)