diff --git a/doc/source/install.rst b/doc/source/install.rst index 36a6c4038f8be..37263899c0521 100644 --- a/doc/source/install.rst +++ b/doc/source/install.rst @@ -250,7 +250,8 @@ Optional Dependencies * `matplotlib `__: for plotting * `statsmodels `__ * Needed for parts of :mod:`pandas.stats` -* `openpyxl `__, `xlrd/xlwt `__ +* `openpyxl `__, `xlrd/xlwt `__, `ezodf `__ + * ezodf supports Open Document Format spreadsheets (ods) * Needed for Excel I/O * `XlsxWriter `__ * Alternative Excel writer diff --git a/pandas/io/excel.py b/pandas/io/excel.py index d5258cb32e6e0..8f1d423800565 100644 --- a/pandas/io/excel.py +++ b/pandas/io/excel.py @@ -4,7 +4,10 @@ #---------------------------------------------------------------------- # ExcelFile class +from __future__ import print_function + import os +import re import datetime import abc import numpy as np @@ -28,6 +31,8 @@ _writer_extensions = ["xlsx", "xls", "xlsm"] _writers = {} +_readers = {} +_reader_extensions = {} def register_writer(klass): @@ -68,12 +73,176 @@ def get_writer(engine_name): raise ValueError("No Excel writer '%s'" % engine_name) +class BaseFile(object): + """Base class for excel readers + + A file class can be initialized even if the engine is not installed. + If the engine is not installed, io_class and workbook_factory are None. + When attempting to use open_workbook while workbook_factory is None, the + relevant ImportError is raised. + """ + + def __init__(self): + """Set the engine name, and extension. If the engine is not installed, + io_class and workbook_factory are both None. + """ + self.io_class = None + self.workbook_factory = None + + def open_workbook(self, *args, **kwargs): + """Explicitely load the engine again (and trigger an ImportError in the + process) if workbook_factory is set to None. + """ + # try to load the engine again and raise import error if required + if self.workbook_factory is None: + self.load_engine() + # just in case the user passes an already opened workbook of io_class + if len(args) > 0 and isinstance(args[0], self.io_class): + self.book = args[0] + else: + self.book = self.workbook_factory(*args, **kwargs) + return self.book + + def create_reader(self, io, engine=None): + """Create the appropriate reader object based on io and optionally + engine. + + Paratemeters + ------------ + io : string, file-like object or xlrd/ezodf workbook + If a string, expected to be a path to xls, xlsx, or ods file. + File-like objects or buffers are only supported for xlrd types. + + engine: string, default None + If io is not a buffer or path, this must be set to identify io. + Acceptable values are None, xlrd, or ezodf + + Returns + ------- + engine : string + Engine used for reading the io + + book : object + The spreadsheet book class created by the engine. 
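Note that create_reader hands back the reader object itself; the engine name and the opened workbook described above are available as its ``engine`` and ``book`` attributes. A minimal dispatch sketch (the file names are placeholders and assume xlrd/ezodf are installed):

    # 'data.xlsx', 'data.ods' and 'data.xls' are illustrative paths only
    reader = BaseFile().create_reader('data.xlsx')   # chosen via _reader_extensions
    reader.engine                                    # 'xlrd'

    reader = BaseFile().create_reader('data.ods')    # ODS handled by EZODFFile
    reader.engine                                    # 'ezodf'

    import xlrd
    book = xlrd.open_workbook('data.xls')
    reader = BaseFile().create_reader(book)          # matched against reader.io_class
    reader.book is book                              # True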
+ """ + + if engine is not None: + try: + reader = _readers[engine] + except KeyError: + msg = 'Excel reader engine "%s" is not implemented' % engine + raise NotImplementedError(msg) + # load_engine throws the relevant import error if not installed + reader.load_engine() + reader.open_workbook(io) + return reader + + if isinstance(io, compat.string_types): + ext = io.split('.')[-1] + try: + reader = _reader_extensions[ext] + except KeyError: + msg = 'No reader implemented for extension "%s"' % ext + raise NotImplementedError(msg) + reader.load_engine() + if _is_url(io): + data = _urlopen(io).read() + reader.read_buffer(data) + else: + reader.open_workbook(io) + return reader + + # try to determine the reader type based on properties of installed + # reader modules. + for engine, reader in compat.iteritems(_readers): + # only try to import readers that have not been imported before + if reader.io_class is None: + try: + reader.load_engine() + # if the reader is not installed, the user could not have + # passed the corresponding reader.io_class, so it is safe + # to assume that io is not the current reader + except ImportError: + continue + # Does the io type match the currently selected reader? + if isinstance(io, reader.io_class): + reader.engine = engine + reader.book = io + return reader + # xlrd has some additional/alternative reading mechanisms: + elif engine=='xlrd' and hasattr(io, "read"): + # N.B. xlrd.Book has a read attribute too + reader.read_buffer(io.read()) + return reader + + raise ValueError('Must explicitly set engine if not passing in buffer ' + 'or path for io.') + + +class XLRDFile(BaseFile): + """File reader class for MS Excel spreadsheets (depends on xlrd) + """ + extensions = ['xls', 'xlsx', 'xlsm'] + engine = 'xlrd' + + def load_engine(self): + import xlrd # throw an ImportError if we need to + ver = tuple(map(int, xlrd.__VERSION__.split(".")[:2])) + if ver < (0, 9): # pragma: no cover + raise ImportError("pandas requires xlrd >= 0.9.0 for excel " + "support, current version " + xlrd.__VERSION__) + else: + self.workbook_factory = xlrd.open_workbook + self.io_class = xlrd.Book + + def read_buffer(self, data): + """Read from a buffer + """ + return self.open_workbook(file_contents=data) + + +class EZODFFile(BaseFile): + """File reader class for ODF spreadsheets (depends on ezodf) + """ + extensions = ['ods'] + engine = 'ezodf' + + def load_engine(self): + import ezodf # throw an ImportError if we need to + self.workbook_factory = ezodf.opendoc + self.io_class = ezodf.document.PackagedDocument + + def read_buffer(self, *args, **kwargs): + """ + """ + msg = 'Can not read ODF spreadsheet from a buffer or URL.' + raise NotImplementedError(msg) + + +# register all supported readers +def register_readers(): + """ + Establish which readers are supported and/or installed. + """ + + def populate(reader): + _readers[reader.engine] = reader + for ext in reader.extensions: + _reader_extensions[ext] = reader + + populate(XLRDFile()) + populate(EZODFFile()) + +register_readers() + + def read_excel(io, sheetname=0, **kwds): - """Read an Excel table into a pandas DataFrame + """Read an Excel/ods table into a pandas DataFrame Parameters ---------- - io : string, file-like object, or xlrd workbook. + io : string, file-like object, or xlrd workbook for MS Excel files. For an + ods file (Open Document Formant), string or ezodf workbook is required. The string could be a URL. Valid URL schemes include http, ftp, s3, and file. For file URLs, a host is expected. 
For instance, a local file could be file://localhost/path/to/workbook.xlsx @@ -106,7 +275,7 @@ def read_excel(io, sheetname=0, **kwds): converters : dict, default None Dict of functions for converting values in certain columns. Keys can either be integers or column labels, values are functions that take one - input argument, the Excel cell content, and return the transformed + input argument, the Excel/ods cell content, and return the transformed content. index_col : int, default None Column to use as the row labels of the DataFrame. Pass None if @@ -126,10 +295,10 @@ def read_excel(io, sheetname=0, **kwds): Indicate number of NA values placed in non-numeric columns engine: string, default None If io is not a buffer or path, this must be set to identify io. - Acceptable values are None or xlrd + Acceptable values are None, xlrd, or ezodf convert_float : boolean, default True convert integral floats to int (i.e., 1.0 --> 1). If False, all numeric - data will be read in as floats: Excel stores all numbers as floats + data will be read in as floats: Excel/ods stores all numbers as floats internally has_index_names : boolean, default False True if the cols defined in index_col have an index name and are @@ -151,47 +320,21 @@ def read_excel(io, sheetname=0, **kwds): class ExcelFile(object): """ Class for parsing tabular excel sheets into DataFrame objects. - Uses xlrd. See ExcelFile.parse for more documentation + Uses xlrd and/or ezodf. See ExcelFile.parse for more documentation Parameters ---------- - io : string, file-like object or xlrd workbook - If a string, expected to be a path to xls or xlsx file + io : string, file-like object or xlrd/ezodf workbook + If a string, expected to be a path to xls, xlsx, or ods file engine: string, default None If io is not a buffer or path, this must be set to identify io. - Acceptable values are None or xlrd + Acceptable values are None, xlrd, or ezodf """ def __init__(self, io, **kwds): - - import xlrd # throw an ImportError if we need to - - ver = tuple(map(int, xlrd.__VERSION__.split(".")[:2])) - if ver < (0, 9): # pragma: no cover - raise ImportError("pandas requires xlrd >= 0.9.0 for excel " - "support, current version " + xlrd.__VERSION__) - self.io = io engine = kwds.pop('engine', None) - - if engine is not None and engine != 'xlrd': - raise ValueError("Unknown engine: %s" % engine) - - if isinstance(io, compat.string_types): - if _is_url(io): - data = _urlopen(io).read() - self.book = xlrd.open_workbook(file_contents=data) - else: - self.book = xlrd.open_workbook(io) - elif engine == 'xlrd' and isinstance(io, xlrd.Book): - self.book = io - elif not isinstance(io, xlrd.Book) and hasattr(io, "read"): - # N.B. 
xlrd.Book has a read attribute too - data = io.read() - self.book = xlrd.open_workbook(file_contents=data) - else: - raise ValueError('Must explicitly set engine if not passing in' - ' buffer or path for io.') + self.reader = BaseFile().create_reader(io, engine=engine) def parse(self, sheetname=0, header=0, skiprows=None, skip_footer=0, index_col=None, parse_cols=None, parse_dates=False, @@ -270,18 +413,25 @@ def parse(self, sheetname=0, header=0, skiprows=None, skip_footer=0, if skipfooter is not None: skip_footer = skipfooter - return self._parse_excel(sheetname=sheetname, header=header, - skiprows=skiprows, - index_col=index_col, - has_index_names=has_index_names, - parse_cols=parse_cols, - parse_dates=parse_dates, - date_parser=date_parser, na_values=na_values, - thousands=thousands, chunksize=chunksize, - skip_footer=skip_footer, - convert_float=convert_float, - converters=converters, - **kwds) + if self.reader.engine == 'ezodf': + parser = self._parse_ods + elif self.reader.engine == 'xlrd': + parser = self._parse_excel + else: + raise ValueError('Engine is not specified.') + + return parser(sheetname=sheetname, header=header, + skiprows=skiprows, + index_col=index_col, + has_index_names=has_index_names, + parse_cols=parse_cols, + parse_dates=parse_dates, + date_parser=date_parser, na_values=na_values, + thousands=thousands, chunksize=chunksize, + skip_footer=skip_footer, + convert_float=convert_float, + converters=converters, + **kwds) def _should_parse(self, i, parse_cols): @@ -326,9 +476,9 @@ def _parse_excel(self, sheetname=0, header=0, skiprows=None, skip_footer=0, XL_CELL_ERROR, XL_CELL_BOOLEAN, XL_CELL_NUMBER) - epoch1904 = self.book.datemode + epoch1904 = self.reader.book.datemode - def _parse_cell(cell_contents,cell_typ): + def _parse_cell(cell_contents, cell_typ): """converts the contents of the cell into a pandas appropriate object""" @@ -377,7 +527,7 @@ def _parse_cell(cell_contents,cell_typ): ret_dict = False - #Keep sheetname to maintain backwards compatibility. + # Keep sheetname to maintain backwards compatibility. if isinstance(sheetname, list): sheets = sheetname ret_dict = True @@ -387,7 +537,7 @@ def _parse_cell(cell_contents,cell_typ): else: sheets = [sheetname] - #handle same-type duplicates. + # handle same-type duplicates. 
sheets = list(set(sheets)) output = {} @@ -397,9 +547,9 @@ def _parse_cell(cell_contents,cell_typ): print("Reading sheet %s" % asheetname) if isinstance(asheetname, compat.string_types): - sheet = self.book.sheet_by_name(asheetname) + sheet = self.reader.book.sheet_by_name(asheetname) else: # assume an integer if not a string - sheet = self.book.sheet_by_index(asheetname) + sheet = self.reader.book.sheet_by_index(asheetname) data = [] should_parse = {} @@ -412,7 +562,7 @@ def _parse_cell(cell_contents,cell_typ): should_parse[j] = self._should_parse(j, parse_cols) if parse_cols is None or should_parse[j]: - row.append(_parse_cell(value,typ)) + row.append(_parse_cell(value, typ)) data.append(row) if sheet.nrows == 0: @@ -439,10 +589,195 @@ def _parse_cell(cell_contents,cell_typ): else: return output[asheetname] + def _parse_ods(self, sheetname=0, header=0, skiprows=None, skip_footer=0, + index_col=None, has_index_names=None, parse_cols=None, + parse_dates=False, date_parser=None, na_values=None, + thousands=None, chunksize=None, convert_float=True, + verbose=False, **kwds): + # adds support for parsing ODS files, see PR #9070 + + def _parse_cell(cell): + """converts the contents of the cell into a pandas + appropriate object""" + if isinstance(cell.value, float): + value = cell.value + if convert_float: + # GH5394 - Excel and ODS 'numbers' are always floats + # it's a minimal perf hit and less suprising + # FIXME: this goes wrong when int(cell.value) returns + # a long (>1e18) + val = int(cell.value) + if val == cell.value: + value = val + elif isinstance(cell.value, compat.string_types): + typ = cell.value_type + if typ == 'date' or typ == 'time': + value = self._parse_datetime(cell) + else: + value = cell.value + elif isinstance(cell.value, bool): + value = cell.value + # empty cells have None as value, type, currency, formula. + # xlrd assigns empty string to empty cells, ezodf assigns None + # test_excel.ExcelReaderTests.test_reader_converters expects empty + # cells to be an empty string + elif isinstance(cell.value, type(None)): + value = '' + else: + value = np.nan + return value + + ret_dict = False + # find numbers for the date/time object conversion + self.regex = re.compile('[[0-9]*[\\.[0-9]+]*]*') + + # Keep sheetname to maintain backwards compatibility. + if isinstance(sheetname, list): + sheets = sheetname + ret_dict = True + elif sheetname is None: + sheets = self.sheet_names + ret_dict = True + else: + sheets = [sheetname] + + # handle same-type duplicates. + sheets = list(set(sheets)) + + output = {} + + for asheetname in sheets: + if verbose: + print("Reading sheet %s" % asheetname) + + # sheetname can be index or string + sheet = self.reader.book.sheets[asheetname] + + data = [] + should_parse = {} + for i in range(sheet.nrows()): + row = [] + for j, cell in enumerate(sheet.row(i)): + + if parse_cols is not None and j not in should_parse: + should_parse[j] = self._should_parse(j, parse_cols) + + if parse_cols is None or should_parse[j]: + row.append(_parse_cell(cell)) + + data.append(row) + + parser = TextParser(data, header=header, index_col=index_col, + has_index_names=has_index_names, + na_values=na_values, + thousands=thousands, + parse_dates=parse_dates, + date_parser=date_parser, + skiprows=skiprows, + skip_footer=skip_footer, + chunksize=chunksize, + **kwds) + output[asheetname] = parser.read() + + if ret_dict: + return output + else: + return output[asheetname] + + def _parse_datetime(self, cell): + """Parse the date or time from on ods cell to a datetime object. 
+ Formats returned by ezodf are documented here: + https://pythonhosted.org/ezodf/tableobjects.html#cell-class + + Because time cells can also be timedeltas, all time fields that exceed + 23 hours are converted to a timedelta object. + + Date string value formats: 'yyyy-mm-dd' or 'yyyy-mm-ddThh:mm:ss' + + Time string value format: 'PThhHmmMss,ffffS' + """ + + def _sec_split_micro(seconds): + """Split a floatingpoint second value into an integer second value + and an integer microsecond value. + """ + sec = float(seconds) + sec_i = int(sec) + microsec = int(round((sec - sec_i)*1e6, 0)) + return sec_i, microsec + + def _timedelta(items): + """ + Possible formats for formulas are: + 'of:=TIME(%H;%M;%S)' + 'of:=TIME(%H;%M;%S.%fS)' + Possible formats for values are: + 'PT%HH%MM%S.%fS' + 'PT%HH%MM%SS' + """ + hours, minutes, seconds = items + return datetime.timedelta(hours=int(hours), minutes=int(minutes), + seconds=float(seconds)) + + def _time(items): + hours, minutes, seconds = items + sec_i, microsec = _sec_split_micro(seconds) + return datetime.time(int(hours), int(minutes), sec_i, microsec) + + def _datetime(items): + """ + Possible formats for values are: + '%Y-%m-%d' + '%Y-%m-%dT%H:%M:%S' + '%Y-%m-%dT%H:%M:%S.%f' + """ + + if len(items) == 3: + year, month, day = [int(k) for k in items] + return datetime.datetime(year, month, day) + else: + year, month, day, hours, minutes = [int(k) for k in items[:-1]] + # seconds can be a float, convert to microseconds + sec_i, microsec = _sec_split_micro(items[-1]) + return datetime.datetime(year, month, day, hours, minutes, + sec_i, microsec) + + # Only consider the value fields, formula's can contain just cell refs. + # Note that cell formatting determines if a value type is time, date or + # just a number. By using the cell value, cell type is consistent with + # what the user will see/format in LibreOffice + items = self.regex.findall(cell.value) + if cell.value_type == 'date': + value = _datetime(items) + else: + try: + # will fail when hours > 23, which is possible in LibreOffice + value = _time(items) + except ValueError: + value = _timedelta(items) + + return value + + def _print_ods_cellinfo(self, cell): + """Convienent for debugging purposes: print all ods cell data. + Cell attributes are documented here: + https://pythonhosted.org/ezodf/tableobjects.html#id2 + """ + print(' plaintext:', cell.plaintext()) # no formatting + # formatted, but what is difference with value? + print('display_form:', cell.display_form) # format, ?=plaintext + print(' value:', cell.value) # data handled + print(' value_type:', cell.value_type) # data type + print(' formula:', cell.formula) + print(' currency:', cell.currency) @property def sheet_names(self): - return self.book.sheet_names() + if self.reader.engine == 'ezodf': + # book.sheet.names() is a generator for ezodf + return [sheetname for sheetname in self.reader.book.sheets.names()] + else: + return self.reader.book.sheet_names() def close(self): """close io if necessary""" @@ -1174,7 +1509,7 @@ class _XlwtWriter(ExcelWriter): def __init__(self, path, engine=None, encoding=None, **engine_kwargs): # Use the xlwt module as the Excel writer. 
import xlwt - engine_kwargs['engine'] = engine + super(_XlwtWriter, self).__init__(path, **engine_kwargs) if encoding is None: diff --git a/pandas/io/tests/data/test1.ods b/pandas/io/tests/data/test1.ods new file mode 100644 index 0000000000000..d07a979deb576 Binary files /dev/null and b/pandas/io/tests/data/test1.ods differ diff --git a/pandas/io/tests/data/test.xls b/pandas/io/tests/data/test1.xls similarity index 100% rename from pandas/io/tests/data/test.xls rename to pandas/io/tests/data/test1.xls diff --git a/pandas/io/tests/data/test.xlsm b/pandas/io/tests/data/test1.xlsm similarity index 100% rename from pandas/io/tests/data/test.xlsm rename to pandas/io/tests/data/test1.xlsm diff --git a/pandas/io/tests/data/test.xlsx b/pandas/io/tests/data/test1.xlsx similarity index 100% rename from pandas/io/tests/data/test.xlsx rename to pandas/io/tests/data/test1.xlsx diff --git a/pandas/io/tests/data/test2.ods b/pandas/io/tests/data/test2.ods new file mode 100644 index 0000000000000..35bfff5220245 Binary files /dev/null and b/pandas/io/tests/data/test2.ods differ diff --git a/pandas/io/tests/data/test2.xlsm b/pandas/io/tests/data/test2.xlsm new file mode 100644 index 0000000000000..31cfba7ede082 Binary files /dev/null and b/pandas/io/tests/data/test2.xlsm differ diff --git a/pandas/io/tests/data/test2.xlsx b/pandas/io/tests/data/test2.xlsx index 441db5e55e666..94dd951e0bb84 100644 Binary files a/pandas/io/tests/data/test2.xlsx and b/pandas/io/tests/data/test2.xlsx differ diff --git a/pandas/io/tests/data/test3.ods b/pandas/io/tests/data/test3.ods new file mode 100644 index 0000000000000..4e072a231bccf Binary files /dev/null and b/pandas/io/tests/data/test3.ods differ diff --git a/pandas/io/tests/data/test3.xlsm b/pandas/io/tests/data/test3.xlsm new file mode 100644 index 0000000000000..54b7ef456a9ea Binary files /dev/null and b/pandas/io/tests/data/test3.xlsm differ diff --git a/pandas/io/tests/data/test3.xlsx b/pandas/io/tests/data/test3.xlsx new file mode 100644 index 0000000000000..c16755c25fabd Binary files /dev/null and b/pandas/io/tests/data/test3.xlsx differ diff --git a/pandas/io/tests/data/test4.ods b/pandas/io/tests/data/test4.ods new file mode 100644 index 0000000000000..71a12f04674e9 Binary files /dev/null and b/pandas/io/tests/data/test4.ods differ diff --git a/pandas/io/tests/data/test4.xls b/pandas/io/tests/data/test4.xls new file mode 100644 index 0000000000000..0e6f4331e2547 Binary files /dev/null and b/pandas/io/tests/data/test4.xls differ diff --git a/pandas/io/tests/data/test4.xlsm b/pandas/io/tests/data/test4.xlsm new file mode 100644 index 0000000000000..52328c7b28be9 Binary files /dev/null and b/pandas/io/tests/data/test4.xlsm differ diff --git a/pandas/io/tests/data/test4.xlsx b/pandas/io/tests/data/test4.xlsx new file mode 100644 index 0000000000000..441db5e55e666 Binary files /dev/null and b/pandas/io/tests/data/test4.xlsx differ diff --git a/pandas/io/tests/data/test_converters.ods b/pandas/io/tests/data/test_converters.ods new file mode 100644 index 0000000000000..3ebb8423daa89 Binary files /dev/null and b/pandas/io/tests/data/test_converters.ods differ diff --git a/pandas/io/tests/data/test_converters.xlsm b/pandas/io/tests/data/test_converters.xlsm new file mode 100644 index 0000000000000..eaf0b1d0219c5 Binary files /dev/null and b/pandas/io/tests/data/test_converters.xlsm differ diff --git a/pandas/io/tests/data/test_datetime.ods b/pandas/io/tests/data/test_datetime.ods new file mode 100644 index 0000000000000..165202a3fb731 Binary files /dev/null and 
b/pandas/io/tests/data/test_datetime.ods differ diff --git a/pandas/io/tests/data/test_multisheet.ods b/pandas/io/tests/data/test_multisheet.ods new file mode 100644 index 0000000000000..275f5350fe853 Binary files /dev/null and b/pandas/io/tests/data/test_multisheet.ods differ diff --git a/pandas/io/tests/data/test_multisheet.xls b/pandas/io/tests/data/test_multisheet.xls new file mode 100644 index 0000000000000..fa37723fcdefb Binary files /dev/null and b/pandas/io/tests/data/test_multisheet.xls differ diff --git a/pandas/io/tests/data/test_multisheet.xlsm b/pandas/io/tests/data/test_multisheet.xlsm new file mode 100644 index 0000000000000..694f8e07d5e29 Binary files /dev/null and b/pandas/io/tests/data/test_multisheet.xlsm differ diff --git a/pandas/io/tests/data/test_types.ods b/pandas/io/tests/data/test_types.ods new file mode 100644 index 0000000000000..bcf5433102f78 Binary files /dev/null and b/pandas/io/tests/data/test_types.ods differ diff --git a/pandas/io/tests/data/test_types.xlsm b/pandas/io/tests/data/test_types.xlsm new file mode 100644 index 0000000000000..c66fdc82dfb67 Binary files /dev/null and b/pandas/io/tests/data/test_types.xlsm differ diff --git a/pandas/io/tests/data/test_types_datetime.ods b/pandas/io/tests/data/test_types_datetime.ods new file mode 100644 index 0000000000000..b010d02fb9949 Binary files /dev/null and b/pandas/io/tests/data/test_types_datetime.ods differ diff --git a/pandas/io/tests/data/times_1900.xlsm b/pandas/io/tests/data/times_1900.xlsm new file mode 100644 index 0000000000000..1ffdbe223453b Binary files /dev/null and b/pandas/io/tests/data/times_1900.xlsm differ diff --git a/pandas/io/tests/data/times_1900.xlsx b/pandas/io/tests/data/times_1900.xlsx new file mode 100644 index 0000000000000..3702289b256fd Binary files /dev/null and b/pandas/io/tests/data/times_1900.xlsx differ diff --git a/pandas/io/tests/data/times_1904.xlsm b/pandas/io/tests/data/times_1904.xlsm new file mode 100644 index 0000000000000..e884eca1e7c74 Binary files /dev/null and b/pandas/io/tests/data/times_1904.xlsm differ diff --git a/pandas/io/tests/data/times_1904.xlsx b/pandas/io/tests/data/times_1904.xlsx new file mode 100644 index 0000000000000..1a13468e59d1c Binary files /dev/null and b/pandas/io/tests/data/times_1904.xlsx differ diff --git a/pandas/io/tests/test_excel.py b/pandas/io/tests/test_excel.py index 83db59f9d9029..4a3d6d18616c0 100644 --- a/pandas/io/tests/test_excel.py +++ b/pandas/io/tests/test_excel.py @@ -1,7 +1,8 @@ # pylint: disable=E1101 from pandas.compat import u, range, map, openpyxl_compat, BytesIO, iteritems -from datetime import datetime, date, time +from pandas import compat +from datetime import datetime, date, time, timedelta import sys import os from distutils.version import LooseVersion @@ -58,6 +59,13 @@ def _skip_if_no_xlsxwriter(): raise nose.SkipTest('xlsxwriter not installed, skipping') +def _skip_if_no_ezodf(): + try: + import ezodf # NOQA + except ImportError: + raise nose.SkipTest('ezodf not installed, skipping') + + def _skip_if_no_excelsuite(): _skip_if_no_xlrd() _skip_if_no_xlwt() @@ -76,11 +84,6 @@ def _skip_if_no_excelsuite(): class SharedItems(object): def setUp(self): self.dirpath = tm.get_data_path() - self.csv1 = os.path.join(self.dirpath, 'test1.csv') - self.csv2 = os.path.join(self.dirpath, 'test2.csv') - self.xls1 = os.path.join(self.dirpath, 'test.xls') - self.xlsx1 = os.path.join(self.dirpath, 'test.xlsx') - self.multisheet = os.path.join(self.dirpath, 'test_multisheet.xlsx') self.frame = _frame.copy() self.frame2 = 
_frame2.copy() self.tsframe = _tsframe.copy() @@ -91,266 +94,176 @@ def read_csv(self, *args, **kwds): kwds['engine'] = 'python' return read_csv(*args, **kwds) + def get_data(self, basename, csv=True): + """ + Return a DataFrame as read by the Python csv engine and a DataFrame + as read by the ExcelFile engine. Test data path is defined by + pandas.util.testing.get_data_path() + + Parameters + ---------- + + basename : str + File base name, excluding file extension. + + csv : boolean, default=True + When True, basename.csv is returned + """ + + excel = ExcelFile(os.path.join(self.dirpath, basename + self.ext)) + if csv: + # the reference is obtained form read_csv with Python engine + pref = os.path.join(self.dirpath, basename + '.csv') + dfref = self.read_csv(pref, index_col=0, parse_dates=True) + return dfref, excel + else: + return excel + + +class ReadingTestsBase(SharedItems): + # This is based on ExcelWriterBase + # + # Base class for test cases to run with different Excel readers. + # To add a reader test, define the following: + # 1. A check_skip function that skips your tests if your reader isn't + # installed. + # 2. Add a property ext, which is the file extension that your reader + # reades from. (needs to start with '.' so it's a valid path) + # 3. Add a property engine_name, which is the name of the reader class. + # For the reader this is not used for anything at the moment. + + def setUp(self): + self.check_skip() + super(ReadingTestsBase, self).setUp() -class ExcelReaderTests(SharedItems, tm.TestCase): def test_parse_cols_int(self): - _skip_if_no_openpyxl() - _skip_if_no_xlrd() - suffix = ['xls', 'xlsx', 'xlsm'] - - for s in suffix: - pth = os.path.join(self.dirpath, 'test.%s' % s) - xls = ExcelFile(pth) - df = xls.parse('Sheet1', index_col=0, parse_dates=True, - parse_cols=3) - df2 = self.read_csv(self.csv1, index_col=0, parse_dates=True) - df2 = df2.reindex(columns=['A', 'B', 'C']) - df3 = xls.parse('Sheet2', skiprows=[1], index_col=0, - parse_dates=True, parse_cols=3) - # TODO add index to xls file) - tm.assert_frame_equal(df, df2, check_names=False) - tm.assert_frame_equal(df3, df2, check_names=False) + dfref, excel = self.get_data('test1') + dfref = dfref.reindex(columns=['A', 'B', 'C']) + df1 = excel.parse('Sheet1', index_col=0, parse_dates=True, + parse_cols=3) + df2 = excel.parse('Sheet2', skiprows=[1], index_col=0, + parse_dates=True, parse_cols=3) + # TODO add index to xls file) + tm.assert_frame_equal(df1, dfref, check_names=False) + tm.assert_frame_equal(df2, dfref, check_names=False) def test_parse_cols_list(self): - _skip_if_no_openpyxl() - _skip_if_no_xlrd() - suffix = ['xls', 'xlsx', 'xlsm'] - - for s in suffix: - pth = os.path.join(self.dirpath, 'test.%s' % s) - xls = ExcelFile(pth) - df = xls.parse('Sheet1', index_col=0, parse_dates=True, - parse_cols=[0, 2, 3]) - df2 = self.read_csv(self.csv1, index_col=0, parse_dates=True) - df2 = df2.reindex(columns=['B', 'C']) - df3 = xls.parse('Sheet2', skiprows=[1], index_col=0, - parse_dates=True, - parse_cols=[0, 2, 3]) - # TODO add index to xls file) - tm.assert_frame_equal(df, df2, check_names=False) - tm.assert_frame_equal(df3, df2, check_names=False) + dfref, excel = self.get_data('test1') + dfref = dfref.reindex(columns=['B', 'C']) + df1 = excel.parse('Sheet1', index_col=0, parse_dates=True, + parse_cols=[0, 2, 3]) + df2 = excel.parse('Sheet2', skiprows=[1], index_col=0, + parse_dates=True, + parse_cols=[0, 2, 3]) + # TODO add index to xls file) + tm.assert_frame_equal(df1, dfref, check_names=False) + 
tm.assert_frame_equal(df2, dfref, check_names=False) def test_parse_cols_str(self): - _skip_if_no_openpyxl() - _skip_if_no_xlrd() - suffix = ['xls', 'xlsx', 'xlsm'] - - for s in suffix: - - pth = os.path.join(self.dirpath, 'test.%s' % s) - xls = ExcelFile(pth) - - df = xls.parse('Sheet1', index_col=0, parse_dates=True, - parse_cols='A:D') - df2 = read_csv(self.csv1, index_col=0, parse_dates=True) - df2 = df2.reindex(columns=['A', 'B', 'C']) - df3 = xls.parse('Sheet2', skiprows=[1], index_col=0, - parse_dates=True, parse_cols='A:D') - # TODO add index to xls, read xls ignores index name ? - tm.assert_frame_equal(df, df2, check_names=False) - tm.assert_frame_equal(df3, df2, check_names=False) - del df, df2, df3 - - df = xls.parse('Sheet1', index_col=0, parse_dates=True, - parse_cols='A,C,D') - df2 = read_csv(self.csv1, index_col=0, parse_dates=True) - df2 = df2.reindex(columns=['B', 'C']) - df3 = xls.parse('Sheet2', skiprows=[1], index_col=0, - parse_dates=True, - parse_cols='A,C,D') - # TODO add index to xls file - tm.assert_frame_equal(df, df2, check_names=False) - tm.assert_frame_equal(df3, df2, check_names=False) - del df, df2, df3 - - df = xls.parse('Sheet1', index_col=0, parse_dates=True, - parse_cols='A,C:D') - df2 = read_csv(self.csv1, index_col=0, parse_dates=True) - df2 = df2.reindex(columns=['B', 'C']) - df3 = xls.parse('Sheet2', skiprows=[1], index_col=0, - parse_dates=True, - parse_cols='A,C:D') - tm.assert_frame_equal(df, df2, check_names=False) - tm.assert_frame_equal(df3, df2, check_names=False) + dfref, excel = self.get_data('test1') + + df1 = dfref.reindex(columns=['A', 'B', 'C']) + df2 = excel.parse('Sheet1', index_col=0, parse_dates=True, + parse_cols='A:D') + df3 = excel.parse('Sheet2', skiprows=[1], index_col=0, + parse_dates=True, parse_cols='A:D') + # TODO add index to xls, read xls ignores index name ? 
+ tm.assert_frame_equal(df2, df1, check_names=False) + tm.assert_frame_equal(df3, df1, check_names=False) + + df1 = dfref.reindex(columns=['B', 'C']) + df2 = excel.parse('Sheet1', index_col=0, parse_dates=True, + parse_cols='A,C,D') + df3 = excel.parse('Sheet2', skiprows=[1], index_col=0, + parse_dates=True, + parse_cols='A,C,D') + # TODO add index to xls file + tm.assert_frame_equal(df2, df1, check_names=False) + tm.assert_frame_equal(df3, df1, check_names=False) + + df1 = dfref.reindex(columns=['B', 'C']) + df2 = excel.parse('Sheet1', index_col=0, parse_dates=True, + parse_cols='A,C:D') + df3 = excel.parse('Sheet2', skiprows=[1], index_col=0, + parse_dates=True, + parse_cols='A,C:D') + tm.assert_frame_equal(df2, df1, check_names=False) + tm.assert_frame_equal(df3, df1, check_names=False) def test_excel_stop_iterator(self): - _skip_if_no_xlrd() - excel_data = ExcelFile(os.path.join(self.dirpath, 'test2.xls')) - parsed = excel_data.parse('Sheet1') + excel = self.get_data('test2', csv=False) + + parsed = excel.parse('Sheet1') expected = DataFrame([['aaaa', 'bbbbb']], columns=['Test', 'Test1']) tm.assert_frame_equal(parsed, expected) def test_excel_cell_error_na(self): - _skip_if_no_xlrd() - excel_data = ExcelFile(os.path.join(self.dirpath, 'test3.xls')) - parsed = excel_data.parse('Sheet1') + excel = self.get_data('test3', csv=False) + + parsed = excel.parse('Sheet1') expected = DataFrame([[np.nan]], columns=['Test']) tm.assert_frame_equal(parsed, expected) def test_excel_passes_na(self): - _skip_if_no_xlrd() - excel_data = ExcelFile(os.path.join(self.dirpath, 'test2.xlsx')) - parsed = excel_data.parse('Sheet1', keep_default_na=False, - na_values=['apple']) + excel = self.get_data('test4', csv=False) + + parsed = excel.parse('Sheet1', keep_default_na=False, + na_values=['apple']) expected = DataFrame([['NA'], [1], ['NA'], [np.nan], ['rabbit']], columns=['Test']) tm.assert_frame_equal(parsed, expected) - parsed = excel_data.parse('Sheet1', keep_default_na=True, - na_values=['apple']) + parsed = excel.parse('Sheet1', keep_default_na=True, + na_values=['apple']) expected = DataFrame([[np.nan], [1], [np.nan], [np.nan], ['rabbit']], columns=['Test']) tm.assert_frame_equal(parsed, expected) - def check_excel_table_sheet_by_index(self, filename, csvfile): - import xlrd - - pth = os.path.join(self.dirpath, filename) - xls = ExcelFile(pth) - df = xls.parse(0, index_col=0, parse_dates=True) - df2 = self.read_csv(csvfile, index_col=0, parse_dates=True) - df3 = xls.parse(1, skiprows=[1], index_col=0, parse_dates=True) - tm.assert_frame_equal(df, df2, check_names=False) - tm.assert_frame_equal(df3, df2, check_names=False) - - df4 = xls.parse(0, index_col=0, parse_dates=True, skipfooter=1) - df5 = xls.parse(0, index_col=0, parse_dates=True, skip_footer=1) - tm.assert_frame_equal(df4, df.ix[:-1]) - tm.assert_frame_equal(df4, df5) - - self.assertRaises(xlrd.XLRDError, xls.parse, 'asdf') - def test_excel_table_sheet_by_index(self): - _skip_if_no_xlrd() - for filename, csvfile in [(self.xls1, self.csv1), - (self.xlsx1, self.csv1)]: - self.check_excel_table_sheet_by_index(filename, csvfile) - - def test_excel_table(self): - _skip_if_no_xlrd() - - pth = os.path.join(self.dirpath, 'test.xls') - xls = ExcelFile(pth) - df = xls.parse('Sheet1', index_col=0, parse_dates=True) - df2 = self.read_csv(self.csv1, index_col=0, parse_dates=True) - df3 = xls.parse('Sheet2', skiprows=[1], index_col=0, parse_dates=True) - tm.assert_frame_equal(df, df2, check_names=False) - tm.assert_frame_equal(df3, df2, check_names=False) - 
- df4 = xls.parse('Sheet1', index_col=0, parse_dates=True, - skipfooter=1) - df5 = xls.parse('Sheet1', index_col=0, parse_dates=True, - skip_footer=1) - tm.assert_frame_equal(df4, df.ix[:-1]) - tm.assert_frame_equal(df4, df5) - - def test_excel_read_buffer(self): - _skip_if_no_xlrd() - _skip_if_no_openpyxl() - - pth = os.path.join(self.dirpath, 'test.xls') - f = open(pth, 'rb') - xls = ExcelFile(f) - # it works - xls.parse('Sheet1', index_col=0, parse_dates=True) - - pth = os.path.join(self.dirpath, 'test.xlsx') - f = open(pth, 'rb') - xl = ExcelFile(f) - xl.parse('Sheet1', index_col=0, parse_dates=True) - - def test_read_xlrd_Book(self): - _skip_if_no_xlrd() - _skip_if_no_xlwt() - - import xlrd - - df = self.frame - - with ensure_clean('.xls') as pth: - df.to_excel(pth, "SheetA") - book = xlrd.open_workbook(pth) - - with ExcelFile(book, engine="xlrd") as xl: - result = xl.parse("SheetA") - tm.assert_frame_equal(df, result) - result = read_excel(book, sheetname="SheetA", engine="xlrd") - tm.assert_frame_equal(df, result) + dfref, excel = self.get_data('test1') - @tm.network - def test_read_from_http_url(self): - _skip_if_no_xlrd() + df1 = excel.parse(0, index_col=0, parse_dates=True) + df2 = excel.parse(1, skiprows=[1], index_col=0, parse_dates=True) + tm.assert_frame_equal(df1, dfref, check_names=False) + tm.assert_frame_equal(df2, dfref, check_names=False) - url = ('https://raw.github.com/pydata/pandas/master/' - 'pandas/io/tests/data/test.xlsx') - url_table = read_excel(url) - dirpath = tm.get_data_path() - localtable = os.path.join(dirpath, 'test.xlsx') - local_table = read_excel(localtable) - tm.assert_frame_equal(url_table, local_table) - - @slow - def test_read_from_file_url(self): - _skip_if_no_xlrd() + df3 = excel.parse(0, index_col=0, parse_dates=True, skipfooter=1) + df4 = excel.parse(0, index_col=0, parse_dates=True, skip_footer=1) + tm.assert_frame_equal(df3, df1.ix[:-1]) + tm.assert_frame_equal(df3, df4) - # FILE - if sys.version_info[:2] < (2, 6): - raise nose.SkipTest("file:// not supported with Python < 2.6") - dirpath = tm.get_data_path() - localtable = os.path.join(dirpath, 'test.xlsx') - local_table = read_excel(localtable) - - try: - url_table = read_excel('file://localhost/' + localtable) - except URLError: - # fails on some systems - raise nose.SkipTest("failing on %s" % - ' '.join(platform.uname()).strip()) - - tm.assert_frame_equal(url_table, local_table) - - def test_xlsx_table(self): - _skip_if_no_xlrd() - _skip_if_no_openpyxl() - - pth = os.path.join(self.dirpath, 'test.xlsx') - xlsx = ExcelFile(pth) - df = xlsx.parse('Sheet1', index_col=0, parse_dates=True) - df2 = self.read_csv(self.csv1, index_col=0, parse_dates=True) - df3 = xlsx.parse('Sheet2', skiprows=[1], index_col=0, parse_dates=True) - - # TODO add index to xlsx file - tm.assert_frame_equal(df, df2, check_names=False) - tm.assert_frame_equal(df3, df2, check_names=False) + if self.ext == '.ods': + self.assertRaises(KeyError, excel.parse, 'asdf') + else: + import xlrd + self.assertRaises(xlrd.XLRDError, excel.parse, 'asdf') - df4 = xlsx.parse('Sheet1', index_col=0, parse_dates=True, - skipfooter=1) - df5 = xlsx.parse('Sheet1', index_col=0, parse_dates=True, - skip_footer=1) - tm.assert_frame_equal(df4, df.ix[:-1]) - tm.assert_frame_equal(df4, df5) + def test_excel_table(self): - def test_reader_closes_file(self): - _skip_if_no_xlrd() - _skip_if_no_openpyxl() + dfref, excel = self.get_data('test1') - pth = os.path.join(self.dirpath, 'test.xlsx') - f = open(pth, 'rb') - with ExcelFile(f) as xlsx: - # 
parses okay - xlsx.parse('Sheet1', index_col=0) + df1 = excel.parse('Sheet1', index_col=0, parse_dates=True) + df2 = excel.parse('Sheet2', skiprows=[1], index_col=0, + parse_dates=True) + # TODO add index to file + tm.assert_frame_equal(df1, dfref, check_names=False) + tm.assert_frame_equal(df2, dfref, check_names=False) - self.assertTrue(f.closed) + df3 = excel.parse('Sheet1', index_col=0, parse_dates=True, + skipfooter=1) + df4 = excel.parse('Sheet1', index_col=0, parse_dates=True, + skip_footer=1) + tm.assert_frame_equal(df3, df1.ix[:-1]) + tm.assert_frame_equal(df3, df4) def test_reader_special_dtypes(self): - _skip_if_no_xlrd() expected = DataFrame.from_items([ ("IntCol", [1, 2, -3, 4, 0]), @@ -364,44 +277,40 @@ def test_reader_special_dtypes(self): datetime(2015, 3, 14)]) ]) - xlsx_path = os.path.join(self.dirpath, 'test_types.xlsx') - xls_path = os.path.join(self.dirpath, 'test_types.xls') + pth = os.path.join(self.dirpath, 'test_types' + self.ext) # should read in correctly and infer types - for path in (xls_path, xlsx_path): - actual = read_excel(path, 'Sheet1') - tm.assert_frame_equal(actual, expected) + actual = read_excel(pth, 'Sheet1') + tm.assert_frame_equal(actual, expected) # if not coercing number, then int comes in as float float_expected = expected.copy() float_expected["IntCol"] = float_expected["IntCol"].astype(float) float_expected.loc[1, "Str2Col"] = 3.0 - for path in (xls_path, xlsx_path): - actual = read_excel(path, 'Sheet1', convert_float=False) - tm.assert_frame_equal(actual, float_expected) + actual = read_excel(pth, 'Sheet1', convert_float=False) + tm.assert_frame_equal(actual, float_expected) # check setting Index (assuming xls and xlsx are the same here) for icol, name in enumerate(expected.columns): - actual = read_excel(xlsx_path, 'Sheet1', index_col=icol) - actual2 = read_excel(xlsx_path, 'Sheet1', index_col=name) + actual = read_excel(pth, 'Sheet1', index_col=icol) exp = expected.set_index(name) tm.assert_frame_equal(actual, exp) - tm.assert_frame_equal(actual2, exp) # convert_float and converters should be different but both accepted expected["StrCol"] = expected["StrCol"].apply(str) - actual = read_excel(xlsx_path, 'Sheet1', converters={"StrCol": str}) + actual = read_excel(pth, 'Sheet1', converters={"StrCol": str}) tm.assert_frame_equal(actual, expected) no_convert_float = float_expected.copy() no_convert_float["StrCol"] = no_convert_float["StrCol"].apply(str) - actual = read_excel(xlsx_path, 'Sheet1', converters={"StrCol": str}, + actual = read_excel(pth, 'Sheet1', converters={"StrCol": str}, convert_float=False) tm.assert_frame_equal(actual, no_convert_float) # GH8212 - support for converters and missing values def test_reader_converters(self): - _skip_if_no_xlrd() + + pth = os.path.join(self.dirpath, 'test_converters' + self.ext) expected = DataFrame.from_items([ ("IntCol", [1, 2, -3, -1000, 0]), @@ -416,48 +325,166 @@ def test_reader_converters(self): 3: lambda x: str(x) if x else '', } - xlsx_path = os.path.join(self.dirpath, 'test_converters.xlsx') - xls_path = os.path.join(self.dirpath, 'test_converters.xls') - # should read in correctly and set types of single cells (not array dtypes) - for path in (xls_path, xlsx_path): - actual = read_excel(path, 'Sheet1', converters=converters) - tm.assert_frame_equal(actual, expected) + actual = read_excel(pth, 'Sheet1', converters=converters) + tm.assert_frame_equal(actual, expected) def test_reading_all_sheets(self): # Test reading all sheetnames by setting sheetname to None, # Ensure a dict is returned. 
# See PR #9450 - - _skip_if_no_xlrd() - - dfs = read_excel(self.multisheet,sheetname=None) - expected_keys = ['Alpha','Beta','Charlie'] - tm.assert_contains_all(expected_keys,dfs.keys()) + pth = os.path.join(self.dirpath, 'test_multisheet' + self.ext) + dfs = read_excel(pth, sheetname=None) + expected_keys = ['Alpha', 'Beta', 'Charlie'] + tm.assert_contains_all(expected_keys, dfs.keys()) def test_reading_multiple_specific_sheets(self): # Test reading specific sheetnames by specifying a mixed list # of integers and strings, and confirm that duplicated sheet # references (positions/names) are removed properly. - # Ensure a dict is returned # See PR #9450 - _skip_if_no_xlrd() - - #Explicitly request duplicates. Only the set should be returned. - expected_keys = [2,'Charlie','Charlie'] - dfs = read_excel(self.multisheet,sheetname=expected_keys) + pth = os.path.join(self.dirpath, 'test_multisheet' + self.ext) + # Explicitly request duplicates. Only the set should be returned. + expected_keys = [2, 'Charlie', 'Charlie'] + dfs = read_excel(pth, sheetname=expected_keys) expected_keys = list(set(expected_keys)) - tm.assert_contains_all(expected_keys,dfs.keys()) + tm.assert_contains_all(expected_keys, dfs.keys()) assert len(expected_keys) == len(dfs.keys()) + +class OdsReaderTests(ReadingTestsBase, tm.TestCase): + ext = '.ods' + engine_name = 'ezodf' + check_skip = staticmethod(_skip_if_no_ezodf) + + def test_read_ezodf_book(self): + + import ezodf + pth = os.path.join(self.dirpath, 'test1' + self.ext) + book = ezodf.opendoc(pth) + result1 = ExcelFile(book).parse() + result2 = read_excel(book) + + df = read_excel(pth) + tm.assert_frame_equal(df, result1) + tm.assert_frame_equal(df, result2) + + def test_types_datetime(self): + + expected = DataFrame.from_items([ + ("UnicodeCol", ['øø', 'ææ', 'åå', 'oø', '€£$¥', '£@$', 'ÅøØæÆ@']), + ("ExpCol", [8.50E-010, 8.50E+012, 9.00E-055, 8.50E+011, 8.5E-10, + 5E-10, 5E-10]), + ("BoolCol", [True, False, True, True, False, False, False]), + ("TimeCol", [time(hour=23, microsecond=1), + time(hour=2), + time(hour=1, minute=1, second=1), + timedelta(days=1, hours=2, minutes=1, seconds=1, + microseconds=1), + timedelta(hours=866, minutes=1, seconds=1, + microseconds=1), + time(2, 59, 40, 500000), + time(23, 59, 59, 100)]), + ("DateTimeCol", [datetime(2014, 10, 10, 10), + datetime(1900, 2, 1, 2), + datetime(2014, 1, 1, 23, 15, 15), + datetime(2011, 2, 3, 4, 5, 6), + datetime(1900, 7, 8, 9, 0, 1), + datetime(2015, 5, 7, 9, 33, 23), + datetime(2015, 5, 7, 2, 33, 23, 300000)]), + ("DateCol", [datetime(2014,3,2), datetime(1900,2,1), + datetime(1899,12,30), datetime(2100,12,11), + datetime(1850,11,3), datetime(2950,11,3), + datetime(2015,7,6)]), + ("TimeInDateFormat", [datetime(1899,12,30,1) for k in range(7)]) + ]) + + pth = os.path.join(self.dirpath, 'test_types_datetime' + self.ext) + dfs = read_excel(pth) + tm.assert_frame_equal(dfs, expected) + + +class XlrdTests(ReadingTestsBase): + """ + This is the base class for the xlrd tests, and 3 different file formats + are supported: xls, xlsx, xlsm + """ + + def test_excel_read_buffer(self): + + pth = os.path.join(self.dirpath, 'test1' + self.ext) + f = open(pth, 'rb') + xls = ExcelFile(f) + # it works + xls.parse('Sheet1', index_col=0, parse_dates=True) + + def test_read_xlrd_Book(self): + _skip_if_no_xlwt() + + import xlrd + df = self.frame + with ensure_clean('.xls') as pth: + df.to_excel(pth, "SheetA") + book = xlrd.open_workbook(pth) + + with ExcelFile(book, engine="xlrd") as xl: + result = xl.parse("SheetA") + 
tm.assert_frame_equal(df, result) + + result = read_excel(book, sheetname="SheetA", engine="xlrd") + tm.assert_frame_equal(df, result) + + @tm.network + def test_read_from_http_url(self): + # TODO: remove this when merging into master + url = ('https://raw.github.com/davidovitch/pandas/master/' + 'pandas/io/tests/data/test1' + self.ext) +# url = ('https://raw.github.com/pydata/pandas/master/' +# 'pandas/io/tests/data/test' + self.ext) + url_table = read_excel(url) + dirpath = tm.get_data_path() + localtable = os.path.join(dirpath, 'test1' + self.ext) + local_table = read_excel(localtable) + tm.assert_frame_equal(url_table, local_table) + + @slow + def test_read_from_file_url(self): + + # FILE + if sys.version_info[:2] < (2, 6): + raise nose.SkipTest("file:// not supported with Python < 2.6") + dirpath = tm.get_data_path() + localtable = os.path.join(dirpath, 'test1' + self.ext) + local_table = read_excel(localtable) + + try: + url_table = read_excel('file://localhost/' + localtable) + except URLError: + # fails on some systems + import platform + raise nose.SkipTest("failing on %s" % + ' '.join(platform.uname()).strip()) + + tm.assert_frame_equal(url_table, local_table) + + def test_reader_closes_file(self): + + pth = os.path.join(self.dirpath, 'test1' + self.ext) + f = open(pth, 'rb') + with ExcelFile(f) as xlsx: + # parses okay + xlsx.parse('Sheet1', index_col=0) + + self.assertTrue(f.closed) + def test_creating_and_reading_multiple_sheets(self): # Test reading multiple sheets, from a runtime created excel file # with multiple sheets. # See PR #9450 - _skip_if_no_xlrd() _skip_if_no_xlwt() + _skip_if_no_openpyxl() def tdf(sheetname): d, i = [11,22,33], [1,2,3] @@ -468,9 +495,9 @@ def tdf(sheetname): dfs = [tdf(s) for s in sheets] dfs = dict(zip(sheets,dfs)) - with ensure_clean('.xlsx') as pth: + with ensure_clean(self.ext) as pth: with ExcelWriter(pth) as ew: - for sheetname, df in iteritems(dfs): + for sheetname, df in compat.iteritems(dfs): df.to_excel(ew,sheetname) dfs_returned = pd.read_excel(pth,sheetname=sheets) for s in sheets: @@ -478,7 +505,6 @@ def tdf(sheetname): def test_reader_seconds(self): # Test reading times with and without milliseconds. GH5945. - _skip_if_no_xlrd() import xlrd if LooseVersion(xlrd.__VERSION__) >= LooseVersion("0.9.3"): @@ -510,8 +536,8 @@ def test_reader_seconds(self): time(16, 37, 1), time(18, 20, 54)])]) - epoch_1900 = os.path.join(self.dirpath, 'times_1900.xls') - epoch_1904 = os.path.join(self.dirpath, 'times_1904.xls') + epoch_1900 = os.path.join(self.dirpath, 'times_1900' + self.ext) + epoch_1904 = os.path.join(self.dirpath, 'times_1904' + self.ext) actual = read_excel(epoch_1900, 'Sheet1') tm.assert_frame_equal(actual, expected) @@ -543,6 +569,24 @@ def test_read_excel_blank_with_header(self): actual = read_excel(blank, 'Sheet1') tm.assert_frame_equal(actual, expected) +class XlsReaderTests(XlrdTests, tm.TestCase): + ext = '.xls' + engine_name = 'xlrd' + check_skip = staticmethod(_skip_if_no_xlrd) + + +class XlsxReaderTests(XlrdTests, tm.TestCase): + ext = '.xlsx' + engine_name = 'xlrd' + check_skip = staticmethod(_skip_if_no_xlrd) + + +class XlsmReaderTests(XlrdTests, tm.TestCase): + ext = '.xlsm' + engine_name = 'xlrd' + check_skip = staticmethod(_skip_if_no_xlrd) + + class ExcelWriterBase(SharedItems): # Base class for test cases to run with different Excel writers. 
# To add a writer test, define the following: @@ -1269,6 +1313,8 @@ def test_datetimes(self): # GH7074 def test_bytes_io(self): + _skip_if_no_xlrd() + bio = BytesIO() df = DataFrame(np.random.randn(10, 2)) writer = ExcelWriter(bio) @@ -1280,6 +1326,8 @@ def test_bytes_io(self): # GH8188 def test_write_lists_dict(self): + _skip_if_no_xlrd() + df = pd.DataFrame({'mixed': ['a', ['b', 'c'], {'d': 'e', 'f': 2}], 'numeric': [1, 2, 3.0], 'str': ['apple', 'banana', 'cherry']}) @@ -1291,6 +1339,7 @@ def test_write_lists_dict(self): read = read_excel(path, 'Sheet1', header=0) tm.assert_frame_equal(read, expected) + def raise_wrapper(major_ver): def versioned_raise_wrapper(orig_method): @functools.wraps(orig_method)
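Taken together, the reader changes above let .ods files flow through the same read_excel/ExcelFile entry points as xls/xlsx. A usage sketch, assuming ezodf is installed ('example.ods' and the sheet name 'Sheet1' are placeholders):

    import pandas as pd

    # 'example.ods' is a placeholder path; the engine is inferred from the
    # .ods extension and dispatched to EZODFFile.
    df = pd.read_excel('example.ods')

    # ExcelFile works the same way; sheet_names is built from book.sheets.names().
    xl = pd.ExcelFile('example.ods')
    print(xl.sheet_names)
    df = xl.parse('Sheet1', index_col=0, parse_dates=True)

    # An already opened ezodf document is also accepted.
    import ezodf
    book = ezodf.opendoc('example.ods')
    df = pd.read_excel(book, engine='ezodf')

Per the parsing code and tests above, ODS time cells come back as datetime.time and fall back to datetime.timedelta once they exceed 23 hours, while reading an .ods file from a buffer or URL raises NotImplementedError.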