pandas-dev · simonjayhawkins · Jan 21, 2020 · Jan 20, 2020
diff --git a/ci/deps/azure-37-locale.yaml b/ci/deps/azure-37-locale.yaml
@@ -34,3 +34,6 @@ dependencies:
   - xlsxwriter
   - xlwt
   - pyarrow>=0.15
+  - pip
+  - pip:
+    - pyxlsb
diff --git a/ci/deps/azure-macos-36.yaml b/ci/deps/azure-macos-36.yaml
@@ -33,3 +33,4 @@ dependencies:
   - pip
   - pip:
     - pyreadstat
+    - pyxlsb
diff --git a/ci/deps/azure-windows-37.yaml b/ci/deps/azure-windows-37.yaml
@@ -35,3 +35,6 @@ dependencies:
   - xlsxwriter
   - xlwt
   - pyreadstat
+  - pip
+  - pip:
+    - pyxlsb
diff --git a/ci/deps/travis-36-cov.yaml b/ci/deps/travis-36-cov.yaml
@@ -51,3 +51,4 @@ dependencies:
     - coverage
     - pandas-datareader
     - python-dateutil
+    - pyxlsb
diff --git a/doc/source/getting_started/install.rst b/doc/source/getting_started/install.rst
@@ -264,6 +264,7 @@ pyarrow                   0.12.0             Parquet, ORC (requires 0.13.0), and
 pymysql                   0.7.11             MySQL engine for sqlalchemy
 pyreadstat                                   SPSS files (.sav) reading
 pytables                  3.4.2              HDF5 reading / writing
+pyxlsb                    1.0.5              Binary Excel files (.xlsb) reading
 qtpy                                         Clipboard I/O
 s3fs                      0.3.0              Amazon S3 access
 tabulate                  0.8.3              Printing in Markdown-friendly format (see `tabulate`_)

diff --git a/doc/source/user_guide/io.rst b/doc/source/user_guide/io.rst
@@ -23,7 +23,7 @@ The pandas I/O API is a set of top level ``reader`` functions accessed like
     text;`JSON <https://www.json.org/>`__;:ref:`read_json<io.json_reader>`;:ref:`to_json<io.json_writer>`
     text;`HTML <https://en.wikipedia.org/wiki/HTML>`__;:ref:`read_html<io.read_html>`;:ref:`to_html<io.html>`
     text; Local clipboard;:ref:`read_clipboard<io.clipboard>`;:ref:`to_clipboard<io.clipboard>`
-    binary;`MS Excel <https://en.wikipedia.org/wiki/Microsoft_Excel>`__;:ref:`read_excel<io.excel_reader>`;:ref:`to_excel<io.excel_writer>`
+    ;`MS Excel <https://en.wikipedia.org/wiki/Microsoft_Excel>`__;:ref:`read_excel<io.excel_reader>`;:ref:`to_excel<io.excel_writer>`
     binary;`OpenDocument <http://www.opendocumentformat.org>`__;:ref:`read_excel<io.ods>`;
     binary;`HDF5 Format <https://support.hdfgroup.org/HDF5/whatishdf5.html>`__;:ref:`read_hdf<io.hdf5>`;:ref:`to_hdf<io.hdf5>`
     binary;`Feather Format <https://github.com/wesm/feather>`__;:ref:`read_feather<io.feather>`;:ref:`to_feather<io.feather>`
@@ -2768,7 +2768,8 @@ Excel files
 
 The :func:`~pandas.read_excel` method can read Excel 2003 (``.xls``)
 files using the ``xlrd`` Python module.  Excel 2007+ (``.xlsx``) files
-can be read using either ``xlrd`` or ``openpyxl``.
+can be read using either ``xlrd`` or ``openpyxl``. Binary Excel (``.xlsb``)
+files can be read using ``pyxlsb``.
 The :meth:`~DataFrame.to_excel` instance method is used for
 saving a ``DataFrame`` to Excel.  Generally the semantics are
 similar to working with :ref:`csv<io.read_csv_table>` data.
@@ -3229,6 +3230,30 @@ OpenDocument spreadsheets match what can be done for `Excel files`_ using
    Currently pandas only supports *reading* OpenDocument spreadsheets. Writing
    is not implemented.
 
+.. _io.xlsb:
+
+Binary Excel (.xlsb) files
+--------------------------
+
+.. versionadded:: 1.0.0
+
+The :func:`~pandas.read_excel` method can also read binary Excel files
+using the ``pyxlsb`` module. The semantics and features for reading
+binary Excel files mostly match what can be done for `Excel files`_ using
+``engine='pyxlsb'``. ``pyxlsb`` does not recognize datetime types
+in files and will return floats instead.
+
+.. code-block:: python
+
+   # Returns a DataFrame
+   pd.read_excel('path_to_file.xlsb', engine='pyxlsb')
+
+.. note::
+
+   Currently pandas only supports *reading* binary Excel files. Writing
+   is not implemented.
+
+
 .. _io.clipboard:
 
 Clipboard

diff --git a/doc/source/whatsnew/v1.0.0.rst b/doc/source/whatsnew/v1.0.0.rst
@@ -215,7 +215,8 @@ Other enhancements
 - :meth:`Styler.format` added the ``na_rep`` parameter to help format the missing values (:issue:`21527`, :issue:`28358`)
 - Roundtripping DataFrames with nullable integer, string and period data types to parquet
   (:meth:`~DataFrame.to_parquet` / :func:`read_parquet`) using the `'pyarrow'` engine
-  now preserve those data types with pyarrow >= 0.16.0 (:issue:`20612`, :issue:`28371`).
+  now preserve those data types with pyarrow >= 0.16.0 (:issue:`20612`, :issue:`28371`).
+- :func:`read_excel` can now read binary Excel (``.xlsb``) files by passing ``engine='pyxlsb'``. For more details and example usage, see the :ref:`Binary Excel files documentation <io.xlsb>` (:issue:`8540`).
 - The ``partition_cols`` argument in :meth:`DataFrame.to_parquet` now accepts a string (:issue:`27117`)
 - :func:`pandas.read_json` now parses ``NaN``, ``Infinity`` and ``-Infinity`` (:issue:`12213`)
 - :func:`to_parquet` now appropriately handles the ``schema`` argument for user defined schemas in the pyarrow engine. (:issue:`30270`)

diff --git a/pandas/compat/_optional.py b/pandas/compat/_optional.py
@@ -19,6 +19,7 @@
     "pyarrow": "0.13.0",
     "pytables": "3.4.2",
     "pytest": "5.0.1",
+    "pyxlsb": "1.0.5",
     "s3fs": "0.3.0",
     "scipy": "0.19.0",
     "sqlalchemy": "1.1.4",

diff --git a/pandas/core/config_init.py b/pandas/core/config_init.py
@@ -479,6 +479,7 @@ def use_inf_as_na_cb(key):
 _xlsm_options = ["xlrd", "openpyxl"]
 _xlsx_options = ["xlrd", "openpyxl"]
 _ods_options = ["odf"]
+_xlsb_options = ["pyxlsb"]
 
 
 with cf.config_prefix("io.excel.xls"):
@@ -515,6 +516,13 @@ def use_inf_as_na_cb(key):
         validator=str,
     )
 
+with cf.config_prefix("io.excel.xlsb"):
+    cf.register_option(
+        "reader",
+        "auto",
+        reader_engine_doc.format(ext="xlsb", others=", ".join(_xlsb_options)),
+        validator=str,
+    )
 
 # Set up the io.excel specific writer configuration.
 writer_engine_doc = """

diff --git a/pandas/io/excel/_base.py b/pandas/io/excel/_base.py
@@ -35,8 +35,9 @@
     """
 Read an Excel file into a pandas DataFrame.
 
-Support both `xls` and `xlsx` file extensions from a local filesystem or URL.
-Support an option to read a single sheet or a list of sheets.
+Supports `xls`, `xlsx`, `xlsm`, `xlsb`, and `odf` file extensions
+read from a local filesystem or URL. Supports an option to read
+a single sheet or a list of sheets.
 
 Parameters
 ----------
@@ -789,15 +790,21 @@ class ExcelFile:
         If a string or path object, expected to be a path to xls, xlsx or odf file.
     engine : str, default None
         If io is not a buffer or path, this must be set to identify io.
-        Acceptable values are None, ``xlrd``, ``openpyxl`` or ``odf``.
+        Acceptable values are None, ``xlrd``, ``openpyxl``, ``odf``, or ``pyxlsb``.
         Note that ``odf`` reads tables out of OpenDocument formatted files.
     """
 
     from pandas.io.excel._odfreader import _ODFReader
     from pandas.io.excel._openpyxl import _OpenpyxlReader
     from pandas.io.excel._xlrd import _XlrdReader
-
-    _engines = {"xlrd": _XlrdReader, "openpyxl": _OpenpyxlReader, "odf": _ODFReader}
+    from pandas.io.excel._pyxlsb import _PyxlsbReader
+
+    _engines = {
+        "xlrd": _XlrdReader,
+        "openpyxl": _OpenpyxlReader,
+        "odf": _ODFReader,
+        "pyxlsb": _PyxlsbReader,
+    }
 
     def __init__(self, io, engine=None):
         if engine is None:

diff --git a/pandas/io/excel/_pyxlsb.py b/pandas/io/excel/_pyxlsb.py
@@ -0,0 +1,68 @@
+from typing import List
+
+from pandas._typing import FilePathOrBuffer, Scalar
+from pandas.compat._optional import import_optional_dependency
+
+from pandas.io.excel._base import _BaseExcelReader
+
+
+class _PyxlsbReader(_BaseExcelReader):
+    def __init__(self, filepath_or_buffer: FilePathOrBuffer):
+        """Reader using pyxlsb engine.
+
+        Parameters
+        ----------
+        filepath_or_buffer : str, path object, or Workbook
+            Object to be parsed.
+        """
+        import_optional_dependency("pyxlsb")
+        # The base-class initializer calls load_workbook on the filepath or buffer
+        # and stores the result on the ``book`` attribute
+        super().__init__(filepath_or_buffer)
+
+    @property
+    def _workbook_class(self):
+        from pyxlsb import Workbook
+
+        return Workbook
+
+    def load_workbook(self, filepath_or_buffer: FilePathOrBuffer):
+        from pyxlsb import open_workbook
+
+        # TODO: hack in buffer capability
+        # This might need some modifications to the pyxlsb library
+        # Actual work for opening it is in xlsbpackage.py, line 20-ish
+
+        return open_workbook(filepath_or_buffer)
+
+    @property
+    def sheet_names(self) -> List[str]:
+        return self.book.sheets
+
+    def get_sheet_by_name(self, name: str):
+        return self.book.get_sheet(name)
+
+    def get_sheet_by_index(self, index: int):
+        # pyxlsb sheets are indexed from 1 onwards, so shift the index by one
+        # There's a fix for this in the source, but the pypi package doesn't have it
+        return self.book.get_sheet(index + 1)
+
+    def _convert_cell(self, cell, convert_float: bool) -> Scalar:
+        # TODO: there is no way to distinguish between floats and datetimes in pyxlsb
+        # This means that there is no way to read datetime types from an xlsb file yet
+        if cell.v is None:
+            return ""  # Prevents non-named columns from not showing up as Unnamed: i
+        if isinstance(cell.v, float) and convert_float:
+            val = int(cell.v)
+            if val == cell.v:
+                return val
+            else:
+                return float(cell.v)
+
+        return cell.v
+
+    def get_sheet_data(self, sheet, convert_float: bool) -> List[List[Scalar]]:
+        return [
+            [self._convert_cell(c, convert_float) for c in r]
+            for r in sheet.rows(sparse=False)
+        ]
diff --git a/pandas/tests/io/data/excel/blank.xlsb b/pandas/tests/io/data/excel/blank.xlsb
diff --git a/pandas/tests/io/data/excel/blank_with_header.xlsb b/pandas/tests/io/data/excel/blank_with_header.xlsb
diff --git a/pandas/tests/io/data/excel/test1.xlsb b/pandas/tests/io/data/excel/test1.xlsb
diff --git a/pandas/tests/io/data/excel/test2.xlsb b/pandas/tests/io/data/excel/test2.xlsb
diff --git a/pandas/tests/io/data/excel/test3.xlsb b/pandas/tests/io/data/excel/test3.xlsb
diff --git a/pandas/tests/io/data/excel/test4.xlsb b/pandas/tests/io/data/excel/test4.xlsb
diff --git a/pandas/tests/io/data/excel/test5.xlsb b/pandas/tests/io/data/excel/test5.xlsb
diff --git a/pandas/tests/io/data/excel/test_converters.xlsb b/pandas/tests/io/data/excel/test_converters.xlsb
diff --git a/pandas/tests/io/data/excel/test_index_name_pre17.xlsb b/pandas/tests/io/data/excel/test_index_name_pre17.xlsb
diff --git a/pandas/tests/io/data/excel/test_multisheet.xlsb b/pandas/tests/io/data/excel/test_multisheet.xlsb
diff --git a/pandas/tests/io/data/excel/test_squeeze.xlsb b/pandas/tests/io/data/excel/test_squeeze.xlsb
diff --git a/pandas/tests/io/data/excel/test_types.xlsb b/pandas/tests/io/data/excel/test_types.xlsb
diff --git a/pandas/tests/io/data/excel/testdateoverflow.xlsb b/pandas/tests/io/data/excel/testdateoverflow.xlsb
diff --git a/pandas/tests/io/data/excel/testdtype.xlsb b/pandas/tests/io/data/excel/testdtype.xlsb
diff --git a/pandas/tests/io/data/excel/testmultiindex.xlsb b/pandas/tests/io/data/excel/testmultiindex.xlsb
diff --git a/pandas/tests/io/data/excel/testskiprows.xlsb b/pandas/tests/io/data/excel/testskiprows.xlsb
diff --git a/pandas/tests/io/data/excel/times_1900.xlsb b/pandas/tests/io/data/excel/times_1900.xlsb
diff --git a/pandas/tests/io/data/excel/times_1904.xlsb b/pandas/tests/io/data/excel/times_1904.xlsb
diff --git a/pandas/tests/io/excel/conftest.py b/pandas/tests/io/excel/conftest.py
@@ -35,7 +35,7 @@ def df_ref(datapath):
     return df_ref
 
 
-@pytest.fixture(params=[".xls", ".xlsx", ".xlsm", ".ods"])
+@pytest.fixture(params=[".xls", ".xlsx", ".xlsm", ".ods", ".xlsb"])
 def read_ext(request):
     """
     Valid extensions for reading Excel files.
-Original file line number
+Diff line change
@@ Expand Up / @@ -34,3 +34,6 @@ dependencies: @@
       - xlsxwriter
       - xlwt
       - pyarrow>=0.15
+      - pip
+      - pip:
+        - pyxlsb