From d0188baacc71905b437a3b186c411b14ecff40a2 Mon Sep 17 00:00:00 2001 From: Thijs Damsma Date: Tue, 12 Feb 2019 09:15:28 +0100 Subject: [PATCH 1/4] Attempt to generalize _XlrdReader __init__ and move it to _BaseExcelReader --- pandas/io/excel/_base.py | 56 ++++++++++++++++++++++++++++++---------- pandas/io/excel/_xlrd.py | 41 ++++++++--------------------- 2 files changed, 54 insertions(+), 43 deletions(-) diff --git a/pandas/io/excel/_base.py b/pandas/io/excel/_base.py index ed5943e9a1698..d7eedf09a38a3 100644 --- a/pandas/io/excel/_base.py +++ b/pandas/io/excel/_base.py @@ -1,27 +1,27 @@ import abc +import os +import warnings from collections import OrderedDict from datetime import date, datetime, timedelta -import os +from io import BytesIO from textwrap import fill -import warnings import pandas.compat as compat from pandas.compat import add_metaclass, range, string_types, u -from pandas.errors import EmptyDataError -from pandas.util._decorators import Appender, deprecate_kwarg - -from pandas.core.dtypes.common import ( - is_bool, is_float, is_integer, is_list_like) - from pandas.core import config +from pandas.core.dtypes.common import (is_bool, is_float, is_integer, + is_list_like) from pandas.core.frame import DataFrame - -from pandas.io.common import _NA_VALUES, _stringify_path, _validate_header_arg -from pandas.io.excel._util import ( - _fill_mi_header, _get_default_writer, _maybe_convert_to_string, - _maybe_convert_usecols, _pop_header_name, get_writer) +from pandas.errors import EmptyDataError +from pandas.io.common import (_NA_VALUES, _is_url, _stringify_path, _urlopen, + _validate_header_arg, get_filepath_or_buffer) +from pandas.io.excel._util import (_fill_mi_header, _get_default_writer, + _maybe_convert_to_string, + _maybe_convert_usecols, _pop_header_name, + get_writer) from pandas.io.formats.printing import pprint_thing from pandas.io.parsers import TextParser +from pandas.util._decorators import Appender, deprecate_kwarg _read_excel_doc = """ Read an Excel file into a pandas DataFrame. @@ -329,6 +329,36 @@ def read_excel(io, @add_metaclass(abc.ABCMeta) class _BaseExcelReader(object): + def __init__(self, filepath_or_buffer): + # If filepath_or_buffer is a url, load the data into a BytesIO + if _is_url(filepath_or_buffer): + filepath_or_buffer = BytesIO(_urlopen(filepath_or_buffer).read()) + elif not isinstance(filepath_or_buffer, + (ExcelFile, self._workbook_class)): + filepath_or_buffer, _, _, _ = get_filepath_or_buffer( + filepath_or_buffer) + + if isinstance(filepath_or_buffer, self._workbook_class): + self.book = filepath_or_buffer + elif hasattr(filepath_or_buffer, "read"): + # N.B. xlrd.Book has a read attribute too + filepath_or_buffer.seek(0) + self.book = self.load_workbook(filepath_or_buffer) + elif isinstance(filepath_or_buffer, compat.string_types): + self.book = self.load_workbook(filepath_or_buffer) + else: + raise ValueError('Must explicitly set engine if not passing in' + ' buffer or path for io.') + + @property + @abc.abstractmethod + def _workbook_class(self): + pass + + @abc.abstractmethod + def load_workbook(self, filepath_or_buffer): + pass + @property @abc.abstractmethod def sheet_names(self): diff --git a/pandas/io/excel/_xlrd.py b/pandas/io/excel/_xlrd.py index 60f7d8f94a399..0a083be39052d 100644 --- a/pandas/io/excel/_xlrd.py +++ b/pandas/io/excel/_xlrd.py @@ -1,13 +1,10 @@ from datetime import time from distutils.version import LooseVersion -from io import UnsupportedOperation +from io import BytesIO import numpy as np -import pandas.compat as compat from pandas.compat import range, zip - -from pandas.io.common import _is_url, _urlopen, get_filepath_or_buffer from pandas.io.excel._base import _BaseExcelReader @@ -32,35 +29,19 @@ def __init__(self, filepath_or_buffer): raise ImportError(err_msg + ". Current version " + xlrd.__VERSION__) - from pandas.io.excel._base import ExcelFile - # If filepath_or_buffer is a url, want to keep the data as bytes so - # can't pass to get_filepath_or_buffer() - if _is_url(filepath_or_buffer): - filepath_or_buffer = _urlopen(filepath_or_buffer) - elif not isinstance(filepath_or_buffer, (ExcelFile, xlrd.Book)): - filepath_or_buffer, _, _, _ = get_filepath_or_buffer( - filepath_or_buffer) - - if isinstance(filepath_or_buffer, xlrd.Book): - self.book = filepath_or_buffer - elif hasattr(filepath_or_buffer, "read"): - # N.B. xlrd.Book has a read attribute too - if hasattr(filepath_or_buffer, 'seek'): - try: - # GH 19779 - filepath_or_buffer.seek(0) - except UnsupportedOperation: - # HTTPResponse does not support seek() - # GH 20434 - pass + self._engine = xlrd + super(_XlrdReader, self).__init__(filepath_or_buffer) + + @property + def _workbook_class(self): + return self._engine.Book + def load_workbook(self, filepath_or_buffer): + if isinstance(filepath_or_buffer, BytesIO): data = filepath_or_buffer.read() - self.book = xlrd.open_workbook(file_contents=data) - elif isinstance(filepath_or_buffer, compat.string_types): - self.book = xlrd.open_workbook(filepath_or_buffer) + return self._engine.open_workbook(file_contents=data) else: - raise ValueError('Must explicitly set engine if not passing in' - ' buffer or path for io.') + return self._engine.open_workbook(filepath_or_buffer) @property def sheet_names(self): From a77a4c7c953f3fcf56547debd9a685c0e2b5bd00 Mon Sep 17 00:00:00 2001 From: Thijs Damsma Date: Mon, 29 Apr 2019 08:48:16 +0200 Subject: [PATCH 2/4] implement suggestions @WillAyd --- pandas/io/excel/_xlrd.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/pandas/io/excel/_xlrd.py b/pandas/io/excel/_xlrd.py index 504932c3d72e0..dbb4030d88c34 100644 --- a/pandas/io/excel/_xlrd.py +++ b/pandas/io/excel/_xlrd.py @@ -29,11 +29,12 @@ def __init__(self, filepath_or_buffer): ". Current version " + xlrd.__VERSION__) self._engine = xlrd - super(_XlrdReader, self).__init__(filepath_or_buffer) + super().__init__(filepath_or_buffer) @property def _workbook_class(self): - return self._engine.Book + from xlrd import Book + return Book def load_workbook(self, filepath_or_buffer): From 22e24bbae8636bcd4dec360cae7124cb9c6a3d34 Mon Sep 17 00:00:00 2001 From: Thijs Damsma Date: Mon, 29 Apr 2019 09:54:48 +0200 Subject: [PATCH 3/4] remove _engine keyword altogether --- pandas/io/excel/_xlrd.py | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/pandas/io/excel/_xlrd.py b/pandas/io/excel/_xlrd.py index dbb4030d88c34..d772ad4bfedfc 100644 --- a/pandas/io/excel/_xlrd.py +++ b/pandas/io/excel/_xlrd.py @@ -28,7 +28,6 @@ def __init__(self, filepath_or_buffer): raise ImportError(err_msg + ". Current version " + xlrd.__VERSION__) - self._engine = xlrd super().__init__(filepath_or_buffer) @property @@ -37,12 +36,12 @@ def _workbook_class(self): return Book def load_workbook(self, filepath_or_buffer): - + from xlrd import open_workbook if isinstance(filepath_or_buffer, (BytesIO, BufferedReader)): data = filepath_or_buffer.read() - return self._engine.open_workbook(file_contents=data) + return open_workbook(file_contents=data) else: - return self._engine.open_workbook(filepath_or_buffer) + return open_workbook(filepath_or_buffer) @property def sheet_names(self): From 903b188f481f28a171d101388d7ea313f1ad847e Mon Sep 17 00:00:00 2001 From: Thijs Damsma Date: Tue, 30 Apr 2019 10:04:48 +0200 Subject: [PATCH 4/4] fix regression for reading s3 files --- pandas/io/excel/_xlrd.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/pandas/io/excel/_xlrd.py b/pandas/io/excel/_xlrd.py index d772ad4bfedfc..18e751274dab9 100644 --- a/pandas/io/excel/_xlrd.py +++ b/pandas/io/excel/_xlrd.py @@ -1,6 +1,5 @@ from datetime import time from distutils.version import LooseVersion -from io import BufferedReader, BytesIO import numpy as np @@ -37,7 +36,7 @@ def _workbook_class(self): def load_workbook(self, filepath_or_buffer): from xlrd import open_workbook - if isinstance(filepath_or_buffer, (BytesIO, BufferedReader)): + if hasattr(filepath_or_buffer, "read"): data = filepath_or_buffer.read() return open_workbook(file_contents=data) else: