diff --git a/doc/source/install.rst b/doc/source/install.rst
index 36a6c4038f8be..37263899c0521 100644
--- a/doc/source/install.rst
+++ b/doc/source/install.rst
@@ -250,7 +250,8 @@ Optional Dependencies
* `matplotlib `__: for plotting
* `statsmodels `__
* Needed for parts of :mod:`pandas.stats`
-* `openpyxl `__, `xlrd/xlwt `__
+* `openpyxl `__, `xlrd/xlwt `__, `ezodf `__
+ * ezodf supports Open Document Format spreadsheets (ods)
* Needed for Excel I/O
* `XlsxWriter `__
* Alternative Excel writer
diff --git a/pandas/io/excel.py b/pandas/io/excel.py
index d5258cb32e6e0..8f1d423800565 100644
--- a/pandas/io/excel.py
+++ b/pandas/io/excel.py
@@ -4,7 +4,10 @@
#----------------------------------------------------------------------
# ExcelFile class
+from __future__ import print_function
+
import os
+import re
import datetime
import abc
import numpy as np
@@ -28,6 +31,8 @@
_writer_extensions = ["xlsx", "xls", "xlsm"]
_writers = {}
+_readers = {}
+_reader_extensions = {}
def register_writer(klass):
@@ -68,12 +73,176 @@ def get_writer(engine_name):
raise ValueError("No Excel writer '%s'" % engine_name)
+class BaseFile(object):
+ """Base class for excel readers
+
+ A file class can be initialized even if the engine is not installed.
+ If the engine is not installed, io_class and workbook_factory are None.
+ When attempting to use open_workbook while workbook_factory is None, the
+ relevant ImportError is raised.
+ """
+
+ def __init__(self):
+        """Initialize io_class and workbook_factory to None; load_engine()
+        sets them once the engine has been imported successfully.
+        """
+ self.io_class = None
+ self.workbook_factory = None
+
+ def open_workbook(self, *args, **kwargs):
+        """Explicitly load the engine again (raising the relevant ImportError
+        in the process) if workbook_factory is set to None.
+ """
+ # try to load the engine again and raise import error if required
+ if self.workbook_factory is None:
+ self.load_engine()
+ # just in case the user passes an already opened workbook of io_class
+ if len(args) > 0 and isinstance(args[0], self.io_class):
+ self.book = args[0]
+ else:
+ self.book = self.workbook_factory(*args, **kwargs)
+ return self.book
+
+ def create_reader(self, io, engine=None):
+ """Create the appropriate reader object based on io and optionally
+ engine.
+
+        Parameters
+        ----------
+ io : string, file-like object or xlrd/ezodf workbook
+ If a string, expected to be a path to xls, xlsx, or ods file.
+ File-like objects or buffers are only supported for xlrd types.
+
+ engine: string, default None
+ If io is not a buffer or path, this must be set to identify io.
+ Acceptable values are None, xlrd, or ezodf
+
+ Returns
+ -------
+ engine : string
+ Engine used for reading the io
+
+ book : object
+ The spreadsheet book class created by the engine.
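+
+        Examples
+        --------
+        Illustrative only: the file names below are placeholders and the
+        matching engine must be installed::
+
+            reader = BaseFile().create_reader('workbook.xlsx')    # uses xlrd
+            reader = BaseFile().create_reader('spreadsheet.ods')  # uses ezodf
+            reader.engine  # 'ezodf'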
+ """
+
+ if engine is not None:
+ try:
+ reader = _readers[engine]
+ except KeyError:
+ msg = 'Excel reader engine "%s" is not implemented' % engine
+ raise NotImplementedError(msg)
+ # load_engine throws the relevant import error if not installed
+ reader.load_engine()
+ reader.open_workbook(io)
+ return reader
+
+ if isinstance(io, compat.string_types):
+ ext = io.split('.')[-1]
+ try:
+ reader = _reader_extensions[ext]
+ except KeyError:
+ msg = 'No reader implemented for extension "%s"' % ext
+ raise NotImplementedError(msg)
+ reader.load_engine()
+ if _is_url(io):
+ data = _urlopen(io).read()
+ reader.read_buffer(data)
+ else:
+ reader.open_workbook(io)
+ return reader
+
+ # try to determine the reader type based on properties of installed
+ # reader modules.
+ for engine, reader in compat.iteritems(_readers):
+ # only try to import readers that have not been imported before
+ if reader.io_class is None:
+ try:
+ reader.load_engine()
+ # if the reader is not installed, the user could not have
+ # passed the corresponding reader.io_class, so it is safe
+ # to assume that io is not the current reader
+ except ImportError:
+ continue
+ # Does the io type match the currently selected reader?
+ if isinstance(io, reader.io_class):
+ reader.engine = engine
+ reader.book = io
+ return reader
+ # xlrd has some additional/alternative reading mechanisms:
+            elif engine == 'xlrd' and hasattr(io, "read"):
+ # N.B. xlrd.Book has a read attribute too
+ reader.read_buffer(io.read())
+ return reader
+
+ raise ValueError('Must explicitly set engine if not passing in buffer '
+ 'or path for io.')
+
+
+class XLRDFile(BaseFile):
+ """File reader class for MS Excel spreadsheets (depends on xlrd)
+ """
+ extensions = ['xls', 'xlsx', 'xlsm']
+ engine = 'xlrd'
+
+ def load_engine(self):
+ import xlrd # throw an ImportError if we need to
+ ver = tuple(map(int, xlrd.__VERSION__.split(".")[:2]))
+ if ver < (0, 9): # pragma: no cover
+ raise ImportError("pandas requires xlrd >= 0.9.0 for excel "
+ "support, current version " + xlrd.__VERSION__)
+ else:
+ self.workbook_factory = xlrd.open_workbook
+ self.io_class = xlrd.Book
+
+ def read_buffer(self, data):
+ """Read from a buffer
+ """
+ return self.open_workbook(file_contents=data)
+
+
+class EZODFFile(BaseFile):
+ """File reader class for ODF spreadsheets (depends on ezodf)
+ """
+ extensions = ['ods']
+ engine = 'ezodf'
+
+ def load_engine(self):
+ import ezodf # throw an ImportError if we need to
+ self.workbook_factory = ezodf.opendoc
+ self.io_class = ezodf.document.PackagedDocument
+
+ def read_buffer(self, *args, **kwargs):
+        """Reading an ODS spreadsheet from a buffer or URL is not supported.
+        """
+        msg = 'Cannot read ODF spreadsheet from a buffer or URL.'
+ raise NotImplementedError(msg)
+
+
+# register all supported readers
+def register_readers():
+ """
+ Establish which readers are supported and/or installed.
+ """
+
+ def populate(reader):
+ _readers[reader.engine] = reader
+ for ext in reader.extensions:
+ _reader_extensions[ext] = reader
+
+ populate(XLRDFile())
+ populate(EZODFFile())
+
+register_readers()
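+# After registration the module-level lookup tables map engine names and file
+# extensions to shared reader instances, roughly:
+#   _readers           -> {'xlrd': <XLRDFile>, 'ezodf': <EZODFFile>}
+#   _reader_extensions -> {'xls': <XLRDFile>, 'xlsx': <XLRDFile>,
+#                          'xlsm': <XLRDFile>, 'ods': <EZODFFile>}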
+
+
def read_excel(io, sheetname=0, **kwds):
- """Read an Excel table into a pandas DataFrame
+ """Read an Excel/ods table into a pandas DataFrame
Parameters
----------
- io : string, file-like object, or xlrd workbook.
+    io : string, file-like object, or xlrd workbook for MS Excel files. For an
+        ods file (Open Document Format), a string or ezodf workbook is
+        required.
The string could be a URL. Valid URL schemes include http, ftp, s3,
and file. For file URLs, a host is expected. For instance, a local
file could be file://localhost/path/to/workbook.xlsx
@@ -106,7 +275,7 @@ def read_excel(io, sheetname=0, **kwds):
converters : dict, default None
Dict of functions for converting values in certain columns. Keys can
either be integers or column labels, values are functions that take one
- input argument, the Excel cell content, and return the transformed
+ input argument, the Excel/ods cell content, and return the transformed
content.
index_col : int, default None
Column to use as the row labels of the DataFrame. Pass None if
@@ -126,10 +295,10 @@ def read_excel(io, sheetname=0, **kwds):
Indicate number of NA values placed in non-numeric columns
engine: string, default None
If io is not a buffer or path, this must be set to identify io.
- Acceptable values are None or xlrd
+ Acceptable values are None, xlrd, or ezodf
convert_float : boolean, default True
convert integral floats to int (i.e., 1.0 --> 1). If False, all numeric
- data will be read in as floats: Excel stores all numbers as floats
+ data will be read in as floats: Excel/ods stores all numbers as floats
internally
has_index_names : boolean, default False
True if the cols defined in index_col have an index name and are
@@ -151,47 +320,21 @@ def read_excel(io, sheetname=0, **kwds):
class ExcelFile(object):
"""
Class for parsing tabular excel sheets into DataFrame objects.
- Uses xlrd. See ExcelFile.parse for more documentation
+ Uses xlrd and/or ezodf. See ExcelFile.parse for more documentation
Parameters
----------
- io : string, file-like object or xlrd workbook
- If a string, expected to be a path to xls or xlsx file
+ io : string, file-like object or xlrd/ezodf workbook
+ If a string, expected to be a path to xls, xlsx, or ods file
engine: string, default None
If io is not a buffer or path, this must be set to identify io.
- Acceptable values are None or xlrd
+ Acceptable values are None, xlrd, or ezodf
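+
+    Examples
+    --------
+    Illustrative only: the paths below are placeholders and require the
+    corresponding engine (xlrd or ezodf) to be installed::
+
+        xls = ExcelFile('path/to/workbook.xlsx')    # parsed with xlrd
+        df = xls.parse('Sheet1', index_col=0)
+
+        ods = ExcelFile('path/to/spreadsheet.ods')  # parsed with ezodf
+        df = ods.parse(0)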
"""
def __init__(self, io, **kwds):
-
- import xlrd # throw an ImportError if we need to
-
- ver = tuple(map(int, xlrd.__VERSION__.split(".")[:2]))
- if ver < (0, 9): # pragma: no cover
- raise ImportError("pandas requires xlrd >= 0.9.0 for excel "
- "support, current version " + xlrd.__VERSION__)
-
self.io = io
engine = kwds.pop('engine', None)
-
- if engine is not None and engine != 'xlrd':
- raise ValueError("Unknown engine: %s" % engine)
-
- if isinstance(io, compat.string_types):
- if _is_url(io):
- data = _urlopen(io).read()
- self.book = xlrd.open_workbook(file_contents=data)
- else:
- self.book = xlrd.open_workbook(io)
- elif engine == 'xlrd' and isinstance(io, xlrd.Book):
- self.book = io
- elif not isinstance(io, xlrd.Book) and hasattr(io, "read"):
- # N.B. xlrd.Book has a read attribute too
- data = io.read()
- self.book = xlrd.open_workbook(file_contents=data)
- else:
- raise ValueError('Must explicitly set engine if not passing in'
- ' buffer or path for io.')
+ self.reader = BaseFile().create_reader(io, engine=engine)
def parse(self, sheetname=0, header=0, skiprows=None, skip_footer=0,
index_col=None, parse_cols=None, parse_dates=False,
@@ -270,18 +413,25 @@ def parse(self, sheetname=0, header=0, skiprows=None, skip_footer=0,
if skipfooter is not None:
skip_footer = skipfooter
- return self._parse_excel(sheetname=sheetname, header=header,
- skiprows=skiprows,
- index_col=index_col,
- has_index_names=has_index_names,
- parse_cols=parse_cols,
- parse_dates=parse_dates,
- date_parser=date_parser, na_values=na_values,
- thousands=thousands, chunksize=chunksize,
- skip_footer=skip_footer,
- convert_float=convert_float,
- converters=converters,
- **kwds)
+ if self.reader.engine == 'ezodf':
+ parser = self._parse_ods
+ elif self.reader.engine == 'xlrd':
+ parser = self._parse_excel
+ else:
+ raise ValueError('Engine is not specified.')
+
+ return parser(sheetname=sheetname, header=header,
+ skiprows=skiprows,
+ index_col=index_col,
+ has_index_names=has_index_names,
+ parse_cols=parse_cols,
+ parse_dates=parse_dates,
+ date_parser=date_parser, na_values=na_values,
+ thousands=thousands, chunksize=chunksize,
+ skip_footer=skip_footer,
+ convert_float=convert_float,
+ converters=converters,
+ **kwds)
def _should_parse(self, i, parse_cols):
@@ -326,9 +476,9 @@ def _parse_excel(self, sheetname=0, header=0, skiprows=None, skip_footer=0,
XL_CELL_ERROR, XL_CELL_BOOLEAN,
XL_CELL_NUMBER)
- epoch1904 = self.book.datemode
+ epoch1904 = self.reader.book.datemode
- def _parse_cell(cell_contents,cell_typ):
+ def _parse_cell(cell_contents, cell_typ):
"""converts the contents of the cell into a pandas
appropriate object"""
@@ -377,7 +527,7 @@ def _parse_cell(cell_contents,cell_typ):
ret_dict = False
- #Keep sheetname to maintain backwards compatibility.
+ # Keep sheetname to maintain backwards compatibility.
if isinstance(sheetname, list):
sheets = sheetname
ret_dict = True
@@ -387,7 +537,7 @@ def _parse_cell(cell_contents,cell_typ):
else:
sheets = [sheetname]
- #handle same-type duplicates.
+ # handle same-type duplicates.
sheets = list(set(sheets))
output = {}
@@ -397,9 +547,9 @@ def _parse_cell(cell_contents,cell_typ):
print("Reading sheet %s" % asheetname)
if isinstance(asheetname, compat.string_types):
- sheet = self.book.sheet_by_name(asheetname)
+ sheet = self.reader.book.sheet_by_name(asheetname)
else: # assume an integer if not a string
- sheet = self.book.sheet_by_index(asheetname)
+ sheet = self.reader.book.sheet_by_index(asheetname)
data = []
should_parse = {}
@@ -412,7 +562,7 @@ def _parse_cell(cell_contents,cell_typ):
should_parse[j] = self._should_parse(j, parse_cols)
if parse_cols is None or should_parse[j]:
- row.append(_parse_cell(value,typ))
+ row.append(_parse_cell(value, typ))
data.append(row)
if sheet.nrows == 0:
@@ -439,10 +589,195 @@ def _parse_cell(cell_contents,cell_typ):
else:
return output[asheetname]
+ def _parse_ods(self, sheetname=0, header=0, skiprows=None, skip_footer=0,
+ index_col=None, has_index_names=None, parse_cols=None,
+ parse_dates=False, date_parser=None, na_values=None,
+ thousands=None, chunksize=None, convert_float=True,
+ verbose=False, **kwds):
+ # adds support for parsing ODS files, see PR #9070
+
+ def _parse_cell(cell):
+ """converts the contents of the cell into a pandas
+ appropriate object"""
+ if isinstance(cell.value, float):
+ value = cell.value
+ if convert_float:
+ # GH5394 - Excel and ODS 'numbers' are always floats
+                    # it's a minimal perf hit and less surprising
+ # FIXME: this goes wrong when int(cell.value) returns
+ # a long (>1e18)
+ val = int(cell.value)
+ if val == cell.value:
+ value = val
+ elif isinstance(cell.value, compat.string_types):
+ typ = cell.value_type
+ if typ == 'date' or typ == 'time':
+ value = self._parse_datetime(cell)
+ else:
+ value = cell.value
+ elif isinstance(cell.value, bool):
+ value = cell.value
+ # empty cells have None as value, type, currency, formula.
+ # xlrd assigns empty string to empty cells, ezodf assigns None
+ # test_excel.ExcelReaderTests.test_reader_converters expects empty
+ # cells to be an empty string
+            elif cell.value is None:
+ value = ''
+ else:
+ value = np.nan
+ return value
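+
+        # Illustrative cell.value -> parsed value mappings (assuming the
+        # default convert_float=True):
+        #   1.0 (float)                      -> 1
+        #   None (empty cell)                -> ''
+        #   '2014-03-02' (value_type 'date') -> datetime.datetime(2014, 3, 2)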
+
+ ret_dict = False
+ # find numbers for the date/time object conversion
+ self.regex = re.compile('[[0-9]*[\\.[0-9]+]*]*')
+
+ # Keep sheetname to maintain backwards compatibility.
+ if isinstance(sheetname, list):
+ sheets = sheetname
+ ret_dict = True
+ elif sheetname is None:
+ sheets = self.sheet_names
+ ret_dict = True
+ else:
+ sheets = [sheetname]
+
+ # handle same-type duplicates.
+ sheets = list(set(sheets))
+
+ output = {}
+
+ for asheetname in sheets:
+ if verbose:
+ print("Reading sheet %s" % asheetname)
+
+ # sheetname can be index or string
+ sheet = self.reader.book.sheets[asheetname]
+
+ data = []
+ should_parse = {}
+ for i in range(sheet.nrows()):
+ row = []
+ for j, cell in enumerate(sheet.row(i)):
+
+ if parse_cols is not None and j not in should_parse:
+ should_parse[j] = self._should_parse(j, parse_cols)
+
+ if parse_cols is None or should_parse[j]:
+ row.append(_parse_cell(cell))
+
+ data.append(row)
+
+ parser = TextParser(data, header=header, index_col=index_col,
+ has_index_names=has_index_names,
+ na_values=na_values,
+ thousands=thousands,
+ parse_dates=parse_dates,
+ date_parser=date_parser,
+ skiprows=skiprows,
+ skip_footer=skip_footer,
+ chunksize=chunksize,
+ **kwds)
+ output[asheetname] = parser.read()
+
+ if ret_dict:
+ return output
+ else:
+ return output[asheetname]
+
+ def _parse_datetime(self, cell):
+        """Parse the date or time from an ods cell to a datetime object.
+ Formats returned by ezodf are documented here:
+ https://pythonhosted.org/ezodf/tableobjects.html#cell-class
+
+ Because time cells can also be timedeltas, all time fields that exceed
+ 23 hours are converted to a timedelta object.
+
+ Date string value formats: 'yyyy-mm-dd' or 'yyyy-mm-ddThh:mm:ss'
+
+ Time string value format: 'PThhHmmMss,ffffS'
+        Time string value format: 'PThhHmmMss,ffffS'
+        """
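+        # Illustrative mappings produced by the helpers below (assuming
+        # self.regex extracts the numeric fields in order):
+        #   '2014-10-10T10:00:00' -> datetime.datetime(2014, 10, 10, 10, 0)
+        #   'PT02H00M00S'         -> datetime.time(2, 0)
+        #   'PT26H01M01.000001S'  -> datetime.timedelta(days=1, seconds=7261,
+        #                                               microseconds=1)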
+
+ def _sec_split_micro(seconds):
+            """Split a floating-point second value into an integer second value
+ and an integer microsecond value.
+ """
+ sec = float(seconds)
+ sec_i = int(sec)
+ microsec = int(round((sec - sec_i)*1e6, 0))
+ return sec_i, microsec
+
+ def _timedelta(items):
+ """
+ Possible formats for formulas are:
+ 'of:=TIME(%H;%M;%S)'
+ 'of:=TIME(%H;%M;%S.%fS)'
+ Possible formats for values are:
+ 'PT%HH%MM%S.%fS'
+ 'PT%HH%MM%SS'
+ """
+ hours, minutes, seconds = items
+ return datetime.timedelta(hours=int(hours), minutes=int(minutes),
+ seconds=float(seconds))
+
+ def _time(items):
+ hours, minutes, seconds = items
+ sec_i, microsec = _sec_split_micro(seconds)
+ return datetime.time(int(hours), int(minutes), sec_i, microsec)
+
+ def _datetime(items):
+ """
+ Possible formats for values are:
+ '%Y-%m-%d'
+ '%Y-%m-%dT%H:%M:%S'
+ '%Y-%m-%dT%H:%M:%S.%f'
+ """
+
+ if len(items) == 3:
+ year, month, day = [int(k) for k in items]
+ return datetime.datetime(year, month, day)
+ else:
+ year, month, day, hours, minutes = [int(k) for k in items[:-1]]
+ # seconds can be a float, convert to microseconds
+ sec_i, microsec = _sec_split_micro(items[-1])
+ return datetime.datetime(year, month, day, hours, minutes,
+ sec_i, microsec)
+
+        # Only consider the value fields; formulas can contain just cell refs.
+ # Note that cell formatting determines if a value type is time, date or
+ # just a number. By using the cell value, cell type is consistent with
+ # what the user will see/format in LibreOffice
+ items = self.regex.findall(cell.value)
+ if cell.value_type == 'date':
+ value = _datetime(items)
+ else:
+ try:
+ # will fail when hours > 23, which is possible in LibreOffice
+ value = _time(items)
+ except ValueError:
+ value = _timedelta(items)
+
+ return value
+
+ def _print_ods_cellinfo(self, cell):
+        """Convenient for debugging purposes: print all ods cell data.
+        Cell attributes are documented here:
+        https://pythonhosted.org/ezodf/tableobjects.html#id2
+        """
+ print(' plaintext:', cell.plaintext()) # no formatting
+        # formatted, but what is the difference with value?
+ print('display_form:', cell.display_form) # format, ?=plaintext
+ print(' value:', cell.value) # data handled
+ print(' value_type:', cell.value_type) # data type
+ print(' formula:', cell.formula)
+ print(' currency:', cell.currency)
@property
def sheet_names(self):
- return self.book.sheet_names()
+ if self.reader.engine == 'ezodf':
+            # book.sheets.names() is a generator for ezodf
+            return list(self.reader.book.sheets.names())
+ else:
+ return self.reader.book.sheet_names()
def close(self):
"""close io if necessary"""
@@ -1174,7 +1509,7 @@ class _XlwtWriter(ExcelWriter):
def __init__(self, path, engine=None, encoding=None, **engine_kwargs):
# Use the xlwt module as the Excel writer.
import xlwt
- engine_kwargs['engine'] = engine
+
super(_XlwtWriter, self).__init__(path, **engine_kwargs)
if encoding is None:
diff --git a/pandas/io/tests/data/test1.ods b/pandas/io/tests/data/test1.ods
new file mode 100644
index 0000000000000..d07a979deb576
Binary files /dev/null and b/pandas/io/tests/data/test1.ods differ
diff --git a/pandas/io/tests/data/test.xls b/pandas/io/tests/data/test1.xls
similarity index 100%
rename from pandas/io/tests/data/test.xls
rename to pandas/io/tests/data/test1.xls
diff --git a/pandas/io/tests/data/test.xlsm b/pandas/io/tests/data/test1.xlsm
similarity index 100%
rename from pandas/io/tests/data/test.xlsm
rename to pandas/io/tests/data/test1.xlsm
diff --git a/pandas/io/tests/data/test.xlsx b/pandas/io/tests/data/test1.xlsx
similarity index 100%
rename from pandas/io/tests/data/test.xlsx
rename to pandas/io/tests/data/test1.xlsx
diff --git a/pandas/io/tests/data/test2.ods b/pandas/io/tests/data/test2.ods
new file mode 100644
index 0000000000000..35bfff5220245
Binary files /dev/null and b/pandas/io/tests/data/test2.ods differ
diff --git a/pandas/io/tests/data/test2.xlsm b/pandas/io/tests/data/test2.xlsm
new file mode 100644
index 0000000000000..31cfba7ede082
Binary files /dev/null and b/pandas/io/tests/data/test2.xlsm differ
diff --git a/pandas/io/tests/data/test2.xlsx b/pandas/io/tests/data/test2.xlsx
index 441db5e55e666..94dd951e0bb84 100644
Binary files a/pandas/io/tests/data/test2.xlsx and b/pandas/io/tests/data/test2.xlsx differ
diff --git a/pandas/io/tests/data/test3.ods b/pandas/io/tests/data/test3.ods
new file mode 100644
index 0000000000000..4e072a231bccf
Binary files /dev/null and b/pandas/io/tests/data/test3.ods differ
diff --git a/pandas/io/tests/data/test3.xlsm b/pandas/io/tests/data/test3.xlsm
new file mode 100644
index 0000000000000..54b7ef456a9ea
Binary files /dev/null and b/pandas/io/tests/data/test3.xlsm differ
diff --git a/pandas/io/tests/data/test3.xlsx b/pandas/io/tests/data/test3.xlsx
new file mode 100644
index 0000000000000..c16755c25fabd
Binary files /dev/null and b/pandas/io/tests/data/test3.xlsx differ
diff --git a/pandas/io/tests/data/test4.ods b/pandas/io/tests/data/test4.ods
new file mode 100644
index 0000000000000..71a12f04674e9
Binary files /dev/null and b/pandas/io/tests/data/test4.ods differ
diff --git a/pandas/io/tests/data/test4.xls b/pandas/io/tests/data/test4.xls
new file mode 100644
index 0000000000000..0e6f4331e2547
Binary files /dev/null and b/pandas/io/tests/data/test4.xls differ
diff --git a/pandas/io/tests/data/test4.xlsm b/pandas/io/tests/data/test4.xlsm
new file mode 100644
index 0000000000000..52328c7b28be9
Binary files /dev/null and b/pandas/io/tests/data/test4.xlsm differ
diff --git a/pandas/io/tests/data/test4.xlsx b/pandas/io/tests/data/test4.xlsx
new file mode 100644
index 0000000000000..441db5e55e666
Binary files /dev/null and b/pandas/io/tests/data/test4.xlsx differ
diff --git a/pandas/io/tests/data/test_converters.ods b/pandas/io/tests/data/test_converters.ods
new file mode 100644
index 0000000000000..3ebb8423daa89
Binary files /dev/null and b/pandas/io/tests/data/test_converters.ods differ
diff --git a/pandas/io/tests/data/test_converters.xlsm b/pandas/io/tests/data/test_converters.xlsm
new file mode 100644
index 0000000000000..eaf0b1d0219c5
Binary files /dev/null and b/pandas/io/tests/data/test_converters.xlsm differ
diff --git a/pandas/io/tests/data/test_datetime.ods b/pandas/io/tests/data/test_datetime.ods
new file mode 100644
index 0000000000000..165202a3fb731
Binary files /dev/null and b/pandas/io/tests/data/test_datetime.ods differ
diff --git a/pandas/io/tests/data/test_multisheet.ods b/pandas/io/tests/data/test_multisheet.ods
new file mode 100644
index 0000000000000..275f5350fe853
Binary files /dev/null and b/pandas/io/tests/data/test_multisheet.ods differ
diff --git a/pandas/io/tests/data/test_multisheet.xls b/pandas/io/tests/data/test_multisheet.xls
new file mode 100644
index 0000000000000..fa37723fcdefb
Binary files /dev/null and b/pandas/io/tests/data/test_multisheet.xls differ
diff --git a/pandas/io/tests/data/test_multisheet.xlsm b/pandas/io/tests/data/test_multisheet.xlsm
new file mode 100644
index 0000000000000..694f8e07d5e29
Binary files /dev/null and b/pandas/io/tests/data/test_multisheet.xlsm differ
diff --git a/pandas/io/tests/data/test_types.ods b/pandas/io/tests/data/test_types.ods
new file mode 100644
index 0000000000000..bcf5433102f78
Binary files /dev/null and b/pandas/io/tests/data/test_types.ods differ
diff --git a/pandas/io/tests/data/test_types.xlsm b/pandas/io/tests/data/test_types.xlsm
new file mode 100644
index 0000000000000..c66fdc82dfb67
Binary files /dev/null and b/pandas/io/tests/data/test_types.xlsm differ
diff --git a/pandas/io/tests/data/test_types_datetime.ods b/pandas/io/tests/data/test_types_datetime.ods
new file mode 100644
index 0000000000000..b010d02fb9949
Binary files /dev/null and b/pandas/io/tests/data/test_types_datetime.ods differ
diff --git a/pandas/io/tests/data/times_1900.xlsm b/pandas/io/tests/data/times_1900.xlsm
new file mode 100644
index 0000000000000..1ffdbe223453b
Binary files /dev/null and b/pandas/io/tests/data/times_1900.xlsm differ
diff --git a/pandas/io/tests/data/times_1900.xlsx b/pandas/io/tests/data/times_1900.xlsx
new file mode 100644
index 0000000000000..3702289b256fd
Binary files /dev/null and b/pandas/io/tests/data/times_1900.xlsx differ
diff --git a/pandas/io/tests/data/times_1904.xlsm b/pandas/io/tests/data/times_1904.xlsm
new file mode 100644
index 0000000000000..e884eca1e7c74
Binary files /dev/null and b/pandas/io/tests/data/times_1904.xlsm differ
diff --git a/pandas/io/tests/data/times_1904.xlsx b/pandas/io/tests/data/times_1904.xlsx
new file mode 100644
index 0000000000000..1a13468e59d1c
Binary files /dev/null and b/pandas/io/tests/data/times_1904.xlsx differ
diff --git a/pandas/io/tests/test_excel.py b/pandas/io/tests/test_excel.py
index 83db59f9d9029..4a3d6d18616c0 100644
--- a/pandas/io/tests/test_excel.py
+++ b/pandas/io/tests/test_excel.py
@@ -1,7 +1,8 @@
# pylint: disable=E1101
from pandas.compat import u, range, map, openpyxl_compat, BytesIO, iteritems
-from datetime import datetime, date, time
+from pandas import compat
+from datetime import datetime, date, time, timedelta
import sys
import os
from distutils.version import LooseVersion
@@ -58,6 +59,13 @@ def _skip_if_no_xlsxwriter():
raise nose.SkipTest('xlsxwriter not installed, skipping')
+def _skip_if_no_ezodf():
+ try:
+ import ezodf # NOQA
+ except ImportError:
+ raise nose.SkipTest('ezodf not installed, skipping')
+
+
def _skip_if_no_excelsuite():
_skip_if_no_xlrd()
_skip_if_no_xlwt()
@@ -76,11 +84,6 @@ def _skip_if_no_excelsuite():
class SharedItems(object):
def setUp(self):
self.dirpath = tm.get_data_path()
- self.csv1 = os.path.join(self.dirpath, 'test1.csv')
- self.csv2 = os.path.join(self.dirpath, 'test2.csv')
- self.xls1 = os.path.join(self.dirpath, 'test.xls')
- self.xlsx1 = os.path.join(self.dirpath, 'test.xlsx')
- self.multisheet = os.path.join(self.dirpath, 'test_multisheet.xlsx')
self.frame = _frame.copy()
self.frame2 = _frame2.copy()
self.tsframe = _tsframe.copy()
@@ -91,266 +94,176 @@ def read_csv(self, *args, **kwds):
kwds['engine'] = 'python'
return read_csv(*args, **kwds)
+ def get_data(self, basename, csv=True):
+ """
+        Return a reference DataFrame as read by the Python csv engine,
+        together with an ExcelFile object for the corresponding spreadsheet.
+        The test data path is defined by pandas.util.testing.get_data_path()
+
+ Parameters
+ ----------
+
+ basename : str
+ File base name, excluding file extension.
+
+        csv : boolean, default True
+            When True, also return the reference DataFrame read from
+            basename.csv; when False only the ExcelFile is returned.
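+
+        For example (illustrative)::
+
+            dfref, excel = self.get_data('test1')
+            excel_only = self.get_data('test2', csv=False)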
+ """
+
+ excel = ExcelFile(os.path.join(self.dirpath, basename + self.ext))
+ if csv:
+            # the reference is obtained from read_csv with the Python engine
+ pref = os.path.join(self.dirpath, basename + '.csv')
+ dfref = self.read_csv(pref, index_col=0, parse_dates=True)
+ return dfref, excel
+ else:
+ return excel
+
+
+class ReadingTestsBase(SharedItems):
+ # This is based on ExcelWriterBase
+ #
+ # Base class for test cases to run with different Excel readers.
+ # To add a reader test, define the following:
+ # 1. A check_skip function that skips your tests if your reader isn't
+ # installed.
+ # 2. Add a property ext, which is the file extension that your reader
+    #    reads from. (needs to start with '.' so it's a valid path)
+ # 3. Add a property engine_name, which is the name of the reader class.
+ # For the reader this is not used for anything at the moment.
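+    #
+    # A minimal illustrative subclass (mirroring OdsReaderTests further down):
+    #
+    #     class OdsReaderTests(ReadingTestsBase, tm.TestCase):
+    #         ext = '.ods'
+    #         engine_name = 'ezodf'
+    #         check_skip = staticmethod(_skip_if_no_ezodf)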
+
+ def setUp(self):
+ self.check_skip()
+ super(ReadingTestsBase, self).setUp()
-class ExcelReaderTests(SharedItems, tm.TestCase):
def test_parse_cols_int(self):
- _skip_if_no_openpyxl()
- _skip_if_no_xlrd()
- suffix = ['xls', 'xlsx', 'xlsm']
-
- for s in suffix:
- pth = os.path.join(self.dirpath, 'test.%s' % s)
- xls = ExcelFile(pth)
- df = xls.parse('Sheet1', index_col=0, parse_dates=True,
- parse_cols=3)
- df2 = self.read_csv(self.csv1, index_col=0, parse_dates=True)
- df2 = df2.reindex(columns=['A', 'B', 'C'])
- df3 = xls.parse('Sheet2', skiprows=[1], index_col=0,
- parse_dates=True, parse_cols=3)
- # TODO add index to xls file)
- tm.assert_frame_equal(df, df2, check_names=False)
- tm.assert_frame_equal(df3, df2, check_names=False)
+ dfref, excel = self.get_data('test1')
+ dfref = dfref.reindex(columns=['A', 'B', 'C'])
+ df1 = excel.parse('Sheet1', index_col=0, parse_dates=True,
+ parse_cols=3)
+ df2 = excel.parse('Sheet2', skiprows=[1], index_col=0,
+ parse_dates=True, parse_cols=3)
+        # TODO add index to xls file
+ tm.assert_frame_equal(df1, dfref, check_names=False)
+ tm.assert_frame_equal(df2, dfref, check_names=False)
def test_parse_cols_list(self):
- _skip_if_no_openpyxl()
- _skip_if_no_xlrd()
- suffix = ['xls', 'xlsx', 'xlsm']
-
- for s in suffix:
- pth = os.path.join(self.dirpath, 'test.%s' % s)
- xls = ExcelFile(pth)
- df = xls.parse('Sheet1', index_col=0, parse_dates=True,
- parse_cols=[0, 2, 3])
- df2 = self.read_csv(self.csv1, index_col=0, parse_dates=True)
- df2 = df2.reindex(columns=['B', 'C'])
- df3 = xls.parse('Sheet2', skiprows=[1], index_col=0,
- parse_dates=True,
- parse_cols=[0, 2, 3])
- # TODO add index to xls file)
- tm.assert_frame_equal(df, df2, check_names=False)
- tm.assert_frame_equal(df3, df2, check_names=False)
+ dfref, excel = self.get_data('test1')
+ dfref = dfref.reindex(columns=['B', 'C'])
+ df1 = excel.parse('Sheet1', index_col=0, parse_dates=True,
+ parse_cols=[0, 2, 3])
+ df2 = excel.parse('Sheet2', skiprows=[1], index_col=0,
+ parse_dates=True,
+ parse_cols=[0, 2, 3])
+        # TODO add index to xls file
+ tm.assert_frame_equal(df1, dfref, check_names=False)
+ tm.assert_frame_equal(df2, dfref, check_names=False)
def test_parse_cols_str(self):
- _skip_if_no_openpyxl()
- _skip_if_no_xlrd()
- suffix = ['xls', 'xlsx', 'xlsm']
-
- for s in suffix:
-
- pth = os.path.join(self.dirpath, 'test.%s' % s)
- xls = ExcelFile(pth)
-
- df = xls.parse('Sheet1', index_col=0, parse_dates=True,
- parse_cols='A:D')
- df2 = read_csv(self.csv1, index_col=0, parse_dates=True)
- df2 = df2.reindex(columns=['A', 'B', 'C'])
- df3 = xls.parse('Sheet2', skiprows=[1], index_col=0,
- parse_dates=True, parse_cols='A:D')
- # TODO add index to xls, read xls ignores index name ?
- tm.assert_frame_equal(df, df2, check_names=False)
- tm.assert_frame_equal(df3, df2, check_names=False)
- del df, df2, df3
-
- df = xls.parse('Sheet1', index_col=0, parse_dates=True,
- parse_cols='A,C,D')
- df2 = read_csv(self.csv1, index_col=0, parse_dates=True)
- df2 = df2.reindex(columns=['B', 'C'])
- df3 = xls.parse('Sheet2', skiprows=[1], index_col=0,
- parse_dates=True,
- parse_cols='A,C,D')
- # TODO add index to xls file
- tm.assert_frame_equal(df, df2, check_names=False)
- tm.assert_frame_equal(df3, df2, check_names=False)
- del df, df2, df3
-
- df = xls.parse('Sheet1', index_col=0, parse_dates=True,
- parse_cols='A,C:D')
- df2 = read_csv(self.csv1, index_col=0, parse_dates=True)
- df2 = df2.reindex(columns=['B', 'C'])
- df3 = xls.parse('Sheet2', skiprows=[1], index_col=0,
- parse_dates=True,
- parse_cols='A,C:D')
- tm.assert_frame_equal(df, df2, check_names=False)
- tm.assert_frame_equal(df3, df2, check_names=False)
+ dfref, excel = self.get_data('test1')
+
+ df1 = dfref.reindex(columns=['A', 'B', 'C'])
+ df2 = excel.parse('Sheet1', index_col=0, parse_dates=True,
+ parse_cols='A:D')
+ df3 = excel.parse('Sheet2', skiprows=[1], index_col=0,
+ parse_dates=True, parse_cols='A:D')
+ # TODO add index to xls, read xls ignores index name ?
+ tm.assert_frame_equal(df2, df1, check_names=False)
+ tm.assert_frame_equal(df3, df1, check_names=False)
+
+ df1 = dfref.reindex(columns=['B', 'C'])
+ df2 = excel.parse('Sheet1', index_col=0, parse_dates=True,
+ parse_cols='A,C,D')
+ df3 = excel.parse('Sheet2', skiprows=[1], index_col=0,
+ parse_dates=True,
+ parse_cols='A,C,D')
+ # TODO add index to xls file
+ tm.assert_frame_equal(df2, df1, check_names=False)
+ tm.assert_frame_equal(df3, df1, check_names=False)
+
+ df1 = dfref.reindex(columns=['B', 'C'])
+ df2 = excel.parse('Sheet1', index_col=0, parse_dates=True,
+ parse_cols='A,C:D')
+ df3 = excel.parse('Sheet2', skiprows=[1], index_col=0,
+ parse_dates=True,
+ parse_cols='A,C:D')
+ tm.assert_frame_equal(df2, df1, check_names=False)
+ tm.assert_frame_equal(df3, df1, check_names=False)
def test_excel_stop_iterator(self):
- _skip_if_no_xlrd()
- excel_data = ExcelFile(os.path.join(self.dirpath, 'test2.xls'))
- parsed = excel_data.parse('Sheet1')
+ excel = self.get_data('test2', csv=False)
+
+ parsed = excel.parse('Sheet1')
expected = DataFrame([['aaaa', 'bbbbb']], columns=['Test', 'Test1'])
tm.assert_frame_equal(parsed, expected)
def test_excel_cell_error_na(self):
- _skip_if_no_xlrd()
- excel_data = ExcelFile(os.path.join(self.dirpath, 'test3.xls'))
- parsed = excel_data.parse('Sheet1')
+ excel = self.get_data('test3', csv=False)
+
+ parsed = excel.parse('Sheet1')
expected = DataFrame([[np.nan]], columns=['Test'])
tm.assert_frame_equal(parsed, expected)
def test_excel_passes_na(self):
- _skip_if_no_xlrd()
- excel_data = ExcelFile(os.path.join(self.dirpath, 'test2.xlsx'))
- parsed = excel_data.parse('Sheet1', keep_default_na=False,
- na_values=['apple'])
+ excel = self.get_data('test4', csv=False)
+
+ parsed = excel.parse('Sheet1', keep_default_na=False,
+ na_values=['apple'])
expected = DataFrame([['NA'], [1], ['NA'], [np.nan], ['rabbit']],
columns=['Test'])
tm.assert_frame_equal(parsed, expected)
- parsed = excel_data.parse('Sheet1', keep_default_na=True,
- na_values=['apple'])
+ parsed = excel.parse('Sheet1', keep_default_na=True,
+ na_values=['apple'])
expected = DataFrame([[np.nan], [1], [np.nan], [np.nan], ['rabbit']],
columns=['Test'])
tm.assert_frame_equal(parsed, expected)
- def check_excel_table_sheet_by_index(self, filename, csvfile):
- import xlrd
-
- pth = os.path.join(self.dirpath, filename)
- xls = ExcelFile(pth)
- df = xls.parse(0, index_col=0, parse_dates=True)
- df2 = self.read_csv(csvfile, index_col=0, parse_dates=True)
- df3 = xls.parse(1, skiprows=[1], index_col=0, parse_dates=True)
- tm.assert_frame_equal(df, df2, check_names=False)
- tm.assert_frame_equal(df3, df2, check_names=False)
-
- df4 = xls.parse(0, index_col=0, parse_dates=True, skipfooter=1)
- df5 = xls.parse(0, index_col=0, parse_dates=True, skip_footer=1)
- tm.assert_frame_equal(df4, df.ix[:-1])
- tm.assert_frame_equal(df4, df5)
-
- self.assertRaises(xlrd.XLRDError, xls.parse, 'asdf')
-
def test_excel_table_sheet_by_index(self):
- _skip_if_no_xlrd()
- for filename, csvfile in [(self.xls1, self.csv1),
- (self.xlsx1, self.csv1)]:
- self.check_excel_table_sheet_by_index(filename, csvfile)
-
- def test_excel_table(self):
- _skip_if_no_xlrd()
-
- pth = os.path.join(self.dirpath, 'test.xls')
- xls = ExcelFile(pth)
- df = xls.parse('Sheet1', index_col=0, parse_dates=True)
- df2 = self.read_csv(self.csv1, index_col=0, parse_dates=True)
- df3 = xls.parse('Sheet2', skiprows=[1], index_col=0, parse_dates=True)
- tm.assert_frame_equal(df, df2, check_names=False)
- tm.assert_frame_equal(df3, df2, check_names=False)
-
- df4 = xls.parse('Sheet1', index_col=0, parse_dates=True,
- skipfooter=1)
- df5 = xls.parse('Sheet1', index_col=0, parse_dates=True,
- skip_footer=1)
- tm.assert_frame_equal(df4, df.ix[:-1])
- tm.assert_frame_equal(df4, df5)
-
- def test_excel_read_buffer(self):
- _skip_if_no_xlrd()
- _skip_if_no_openpyxl()
-
- pth = os.path.join(self.dirpath, 'test.xls')
- f = open(pth, 'rb')
- xls = ExcelFile(f)
- # it works
- xls.parse('Sheet1', index_col=0, parse_dates=True)
-
- pth = os.path.join(self.dirpath, 'test.xlsx')
- f = open(pth, 'rb')
- xl = ExcelFile(f)
- xl.parse('Sheet1', index_col=0, parse_dates=True)
-
- def test_read_xlrd_Book(self):
- _skip_if_no_xlrd()
- _skip_if_no_xlwt()
-
- import xlrd
-
- df = self.frame
-
- with ensure_clean('.xls') as pth:
- df.to_excel(pth, "SheetA")
- book = xlrd.open_workbook(pth)
-
- with ExcelFile(book, engine="xlrd") as xl:
- result = xl.parse("SheetA")
- tm.assert_frame_equal(df, result)
- result = read_excel(book, sheetname="SheetA", engine="xlrd")
- tm.assert_frame_equal(df, result)
+ dfref, excel = self.get_data('test1')
- @tm.network
- def test_read_from_http_url(self):
- _skip_if_no_xlrd()
+ df1 = excel.parse(0, index_col=0, parse_dates=True)
+ df2 = excel.parse(1, skiprows=[1], index_col=0, parse_dates=True)
+ tm.assert_frame_equal(df1, dfref, check_names=False)
+ tm.assert_frame_equal(df2, dfref, check_names=False)
- url = ('https://raw.github.com/pydata/pandas/master/'
- 'pandas/io/tests/data/test.xlsx')
- url_table = read_excel(url)
- dirpath = tm.get_data_path()
- localtable = os.path.join(dirpath, 'test.xlsx')
- local_table = read_excel(localtable)
- tm.assert_frame_equal(url_table, local_table)
-
- @slow
- def test_read_from_file_url(self):
- _skip_if_no_xlrd()
+ df3 = excel.parse(0, index_col=0, parse_dates=True, skipfooter=1)
+ df4 = excel.parse(0, index_col=0, parse_dates=True, skip_footer=1)
+ tm.assert_frame_equal(df3, df1.ix[:-1])
+ tm.assert_frame_equal(df3, df4)
- # FILE
- if sys.version_info[:2] < (2, 6):
- raise nose.SkipTest("file:// not supported with Python < 2.6")
- dirpath = tm.get_data_path()
- localtable = os.path.join(dirpath, 'test.xlsx')
- local_table = read_excel(localtable)
-
- try:
- url_table = read_excel('file://localhost/' + localtable)
- except URLError:
- # fails on some systems
- raise nose.SkipTest("failing on %s" %
- ' '.join(platform.uname()).strip())
-
- tm.assert_frame_equal(url_table, local_table)
-
- def test_xlsx_table(self):
- _skip_if_no_xlrd()
- _skip_if_no_openpyxl()
-
- pth = os.path.join(self.dirpath, 'test.xlsx')
- xlsx = ExcelFile(pth)
- df = xlsx.parse('Sheet1', index_col=0, parse_dates=True)
- df2 = self.read_csv(self.csv1, index_col=0, parse_dates=True)
- df3 = xlsx.parse('Sheet2', skiprows=[1], index_col=0, parse_dates=True)
-
- # TODO add index to xlsx file
- tm.assert_frame_equal(df, df2, check_names=False)
- tm.assert_frame_equal(df3, df2, check_names=False)
+ if self.ext == '.ods':
+ self.assertRaises(KeyError, excel.parse, 'asdf')
+ else:
+ import xlrd
+ self.assertRaises(xlrd.XLRDError, excel.parse, 'asdf')
- df4 = xlsx.parse('Sheet1', index_col=0, parse_dates=True,
- skipfooter=1)
- df5 = xlsx.parse('Sheet1', index_col=0, parse_dates=True,
- skip_footer=1)
- tm.assert_frame_equal(df4, df.ix[:-1])
- tm.assert_frame_equal(df4, df5)
+ def test_excel_table(self):
- def test_reader_closes_file(self):
- _skip_if_no_xlrd()
- _skip_if_no_openpyxl()
+ dfref, excel = self.get_data('test1')
- pth = os.path.join(self.dirpath, 'test.xlsx')
- f = open(pth, 'rb')
- with ExcelFile(f) as xlsx:
- # parses okay
- xlsx.parse('Sheet1', index_col=0)
+ df1 = excel.parse('Sheet1', index_col=0, parse_dates=True)
+ df2 = excel.parse('Sheet2', skiprows=[1], index_col=0,
+ parse_dates=True)
+ # TODO add index to file
+ tm.assert_frame_equal(df1, dfref, check_names=False)
+ tm.assert_frame_equal(df2, dfref, check_names=False)
- self.assertTrue(f.closed)
+ df3 = excel.parse('Sheet1', index_col=0, parse_dates=True,
+ skipfooter=1)
+ df4 = excel.parse('Sheet1', index_col=0, parse_dates=True,
+ skip_footer=1)
+ tm.assert_frame_equal(df3, df1.ix[:-1])
+ tm.assert_frame_equal(df3, df4)
def test_reader_special_dtypes(self):
- _skip_if_no_xlrd()
expected = DataFrame.from_items([
("IntCol", [1, 2, -3, 4, 0]),
@@ -364,44 +277,40 @@ def test_reader_special_dtypes(self):
datetime(2015, 3, 14)])
])
- xlsx_path = os.path.join(self.dirpath, 'test_types.xlsx')
- xls_path = os.path.join(self.dirpath, 'test_types.xls')
+ pth = os.path.join(self.dirpath, 'test_types' + self.ext)
# should read in correctly and infer types
- for path in (xls_path, xlsx_path):
- actual = read_excel(path, 'Sheet1')
- tm.assert_frame_equal(actual, expected)
+ actual = read_excel(pth, 'Sheet1')
+ tm.assert_frame_equal(actual, expected)
# if not coercing number, then int comes in as float
float_expected = expected.copy()
float_expected["IntCol"] = float_expected["IntCol"].astype(float)
float_expected.loc[1, "Str2Col"] = 3.0
- for path in (xls_path, xlsx_path):
- actual = read_excel(path, 'Sheet1', convert_float=False)
- tm.assert_frame_equal(actual, float_expected)
+ actual = read_excel(pth, 'Sheet1', convert_float=False)
+ tm.assert_frame_equal(actual, float_expected)
# check setting Index (assuming xls and xlsx are the same here)
for icol, name in enumerate(expected.columns):
- actual = read_excel(xlsx_path, 'Sheet1', index_col=icol)
- actual2 = read_excel(xlsx_path, 'Sheet1', index_col=name)
+ actual = read_excel(pth, 'Sheet1', index_col=icol)
exp = expected.set_index(name)
tm.assert_frame_equal(actual, exp)
- tm.assert_frame_equal(actual2, exp)
# convert_float and converters should be different but both accepted
expected["StrCol"] = expected["StrCol"].apply(str)
- actual = read_excel(xlsx_path, 'Sheet1', converters={"StrCol": str})
+ actual = read_excel(pth, 'Sheet1', converters={"StrCol": str})
tm.assert_frame_equal(actual, expected)
no_convert_float = float_expected.copy()
no_convert_float["StrCol"] = no_convert_float["StrCol"].apply(str)
- actual = read_excel(xlsx_path, 'Sheet1', converters={"StrCol": str},
+ actual = read_excel(pth, 'Sheet1', converters={"StrCol": str},
convert_float=False)
tm.assert_frame_equal(actual, no_convert_float)
# GH8212 - support for converters and missing values
def test_reader_converters(self):
- _skip_if_no_xlrd()
+
+ pth = os.path.join(self.dirpath, 'test_converters' + self.ext)
expected = DataFrame.from_items([
("IntCol", [1, 2, -3, -1000, 0]),
@@ -416,48 +325,166 @@ def test_reader_converters(self):
3: lambda x: str(x) if x else '',
}
- xlsx_path = os.path.join(self.dirpath, 'test_converters.xlsx')
- xls_path = os.path.join(self.dirpath, 'test_converters.xls')
-
# should read in correctly and set types of single cells (not array dtypes)
- for path in (xls_path, xlsx_path):
- actual = read_excel(path, 'Sheet1', converters=converters)
- tm.assert_frame_equal(actual, expected)
+ actual = read_excel(pth, 'Sheet1', converters=converters)
+ tm.assert_frame_equal(actual, expected)
def test_reading_all_sheets(self):
# Test reading all sheetnames by setting sheetname to None,
# Ensure a dict is returned.
# See PR #9450
-
- _skip_if_no_xlrd()
-
- dfs = read_excel(self.multisheet,sheetname=None)
- expected_keys = ['Alpha','Beta','Charlie']
- tm.assert_contains_all(expected_keys,dfs.keys())
+ pth = os.path.join(self.dirpath, 'test_multisheet' + self.ext)
+ dfs = read_excel(pth, sheetname=None)
+ expected_keys = ['Alpha', 'Beta', 'Charlie']
+ tm.assert_contains_all(expected_keys, dfs.keys())
def test_reading_multiple_specific_sheets(self):
# Test reading specific sheetnames by specifying a mixed list
# of integers and strings, and confirm that duplicated sheet
# references (positions/names) are removed properly.
-
# Ensure a dict is returned
# See PR #9450
- _skip_if_no_xlrd()
-
- #Explicitly request duplicates. Only the set should be returned.
- expected_keys = [2,'Charlie','Charlie']
- dfs = read_excel(self.multisheet,sheetname=expected_keys)
+ pth = os.path.join(self.dirpath, 'test_multisheet' + self.ext)
+ # Explicitly request duplicates. Only the set should be returned.
+ expected_keys = [2, 'Charlie', 'Charlie']
+ dfs = read_excel(pth, sheetname=expected_keys)
expected_keys = list(set(expected_keys))
- tm.assert_contains_all(expected_keys,dfs.keys())
+ tm.assert_contains_all(expected_keys, dfs.keys())
assert len(expected_keys) == len(dfs.keys())
+
+class OdsReaderTests(ReadingTestsBase, tm.TestCase):
+ ext = '.ods'
+ engine_name = 'ezodf'
+ check_skip = staticmethod(_skip_if_no_ezodf)
+
+ def test_read_ezodf_book(self):
+
+ import ezodf
+ pth = os.path.join(self.dirpath, 'test1' + self.ext)
+ book = ezodf.opendoc(pth)
+ result1 = ExcelFile(book).parse()
+ result2 = read_excel(book)
+
+ df = read_excel(pth)
+ tm.assert_frame_equal(df, result1)
+ tm.assert_frame_equal(df, result2)
+
+ def test_types_datetime(self):
+
+ expected = DataFrame.from_items([
+ ("UnicodeCol", ['øø', 'ææ', 'åå', 'oø', '€£$¥', '£@$', 'ÅøØæÆ@']),
+ ("ExpCol", [8.50E-010, 8.50E+012, 9.00E-055, 8.50E+011, 8.5E-10,
+ 5E-10, 5E-10]),
+ ("BoolCol", [True, False, True, True, False, False, False]),
+ ("TimeCol", [time(hour=23, microsecond=1),
+ time(hour=2),
+ time(hour=1, minute=1, second=1),
+ timedelta(days=1, hours=2, minutes=1, seconds=1,
+ microseconds=1),
+ timedelta(hours=866, minutes=1, seconds=1,
+ microseconds=1),
+ time(2, 59, 40, 500000),
+ time(23, 59, 59, 100)]),
+ ("DateTimeCol", [datetime(2014, 10, 10, 10),
+ datetime(1900, 2, 1, 2),
+ datetime(2014, 1, 1, 23, 15, 15),
+ datetime(2011, 2, 3, 4, 5, 6),
+ datetime(1900, 7, 8, 9, 0, 1),
+ datetime(2015, 5, 7, 9, 33, 23),
+ datetime(2015, 5, 7, 2, 33, 23, 300000)]),
+ ("DateCol", [datetime(2014,3,2), datetime(1900,2,1),
+ datetime(1899,12,30), datetime(2100,12,11),
+ datetime(1850,11,3), datetime(2950,11,3),
+ datetime(2015,7,6)]),
+ ("TimeInDateFormat", [datetime(1899,12,30,1) for k in range(7)])
+ ])
+
+ pth = os.path.join(self.dirpath, 'test_types_datetime' + self.ext)
+ dfs = read_excel(pth)
+ tm.assert_frame_equal(dfs, expected)
+
+
+class XlrdTests(ReadingTestsBase):
+ """
+ This is the base class for the xlrd tests, and 3 different file formats
+ are supported: xls, xlsx, xlsm
+ """
+
+ def test_excel_read_buffer(self):
+
+ pth = os.path.join(self.dirpath, 'test1' + self.ext)
+ f = open(pth, 'rb')
+ xls = ExcelFile(f)
+ # it works
+ xls.parse('Sheet1', index_col=0, parse_dates=True)
+
+ def test_read_xlrd_Book(self):
+ _skip_if_no_xlwt()
+
+ import xlrd
+ df = self.frame
+ with ensure_clean('.xls') as pth:
+ df.to_excel(pth, "SheetA")
+ book = xlrd.open_workbook(pth)
+
+ with ExcelFile(book, engine="xlrd") as xl:
+ result = xl.parse("SheetA")
+ tm.assert_frame_equal(df, result)
+
+ result = read_excel(book, sheetname="SheetA", engine="xlrd")
+ tm.assert_frame_equal(df, result)
+
+ @tm.network
+ def test_read_from_http_url(self):
+ # TODO: remove this when merging into master
+ url = ('https://raw.github.com/davidovitch/pandas/master/'
+ 'pandas/io/tests/data/test1' + self.ext)
+#        url = ('https://raw.github.com/pydata/pandas/master/'
+#               'pandas/io/tests/data/test1' + self.ext)
+ url_table = read_excel(url)
+ dirpath = tm.get_data_path()
+ localtable = os.path.join(dirpath, 'test1' + self.ext)
+ local_table = read_excel(localtable)
+ tm.assert_frame_equal(url_table, local_table)
+
+ @slow
+ def test_read_from_file_url(self):
+
+ # FILE
+ if sys.version_info[:2] < (2, 6):
+ raise nose.SkipTest("file:// not supported with Python < 2.6")
+ dirpath = tm.get_data_path()
+ localtable = os.path.join(dirpath, 'test1' + self.ext)
+ local_table = read_excel(localtable)
+
+ try:
+ url_table = read_excel('file://localhost/' + localtable)
+ except URLError:
+ # fails on some systems
+ import platform
+ raise nose.SkipTest("failing on %s" %
+ ' '.join(platform.uname()).strip())
+
+ tm.assert_frame_equal(url_table, local_table)
+
+ def test_reader_closes_file(self):
+
+ pth = os.path.join(self.dirpath, 'test1' + self.ext)
+ f = open(pth, 'rb')
+ with ExcelFile(f) as xlsx:
+ # parses okay
+ xlsx.parse('Sheet1', index_col=0)
+
+ self.assertTrue(f.closed)
+
def test_creating_and_reading_multiple_sheets(self):
# Test reading multiple sheets, from a runtime created excel file
# with multiple sheets.
# See PR #9450
- _skip_if_no_xlrd()
_skip_if_no_xlwt()
+ _skip_if_no_openpyxl()
def tdf(sheetname):
d, i = [11,22,33], [1,2,3]
@@ -468,9 +495,9 @@ def tdf(sheetname):
dfs = [tdf(s) for s in sheets]
dfs = dict(zip(sheets,dfs))
- with ensure_clean('.xlsx') as pth:
+ with ensure_clean(self.ext) as pth:
with ExcelWriter(pth) as ew:
- for sheetname, df in iteritems(dfs):
+ for sheetname, df in compat.iteritems(dfs):
df.to_excel(ew,sheetname)
dfs_returned = pd.read_excel(pth,sheetname=sheets)
for s in sheets:
@@ -478,7 +505,6 @@ def tdf(sheetname):
def test_reader_seconds(self):
# Test reading times with and without milliseconds. GH5945.
- _skip_if_no_xlrd()
import xlrd
if LooseVersion(xlrd.__VERSION__) >= LooseVersion("0.9.3"):
@@ -510,8 +536,8 @@ def test_reader_seconds(self):
time(16, 37, 1),
time(18, 20, 54)])])
- epoch_1900 = os.path.join(self.dirpath, 'times_1900.xls')
- epoch_1904 = os.path.join(self.dirpath, 'times_1904.xls')
+ epoch_1900 = os.path.join(self.dirpath, 'times_1900' + self.ext)
+ epoch_1904 = os.path.join(self.dirpath, 'times_1904' + self.ext)
actual = read_excel(epoch_1900, 'Sheet1')
tm.assert_frame_equal(actual, expected)
@@ -543,6 +569,24 @@ def test_read_excel_blank_with_header(self):
actual = read_excel(blank, 'Sheet1')
tm.assert_frame_equal(actual, expected)
+class XlsReaderTests(XlrdTests, tm.TestCase):
+ ext = '.xls'
+ engine_name = 'xlrd'
+ check_skip = staticmethod(_skip_if_no_xlrd)
+
+
+class XlsxReaderTests(XlrdTests, tm.TestCase):
+ ext = '.xlsx'
+ engine_name = 'xlrd'
+ check_skip = staticmethod(_skip_if_no_xlrd)
+
+
+class XlsmReaderTests(XlrdTests, tm.TestCase):
+ ext = '.xlsm'
+ engine_name = 'xlrd'
+ check_skip = staticmethod(_skip_if_no_xlrd)
+
+
class ExcelWriterBase(SharedItems):
# Base class for test cases to run with different Excel writers.
# To add a writer test, define the following:
@@ -1269,6 +1313,8 @@ def test_datetimes(self):
# GH7074
def test_bytes_io(self):
+ _skip_if_no_xlrd()
+
bio = BytesIO()
df = DataFrame(np.random.randn(10, 2))
writer = ExcelWriter(bio)
@@ -1280,6 +1326,8 @@ def test_bytes_io(self):
# GH8188
def test_write_lists_dict(self):
+ _skip_if_no_xlrd()
+
df = pd.DataFrame({'mixed': ['a', ['b', 'c'], {'d': 'e', 'f': 2}],
'numeric': [1, 2, 3.0],
'str': ['apple', 'banana', 'cherry']})
@@ -1291,6 +1339,7 @@ def test_write_lists_dict(self):
read = read_excel(path, 'Sheet1', header=0)
tm.assert_frame_equal(read, expected)
+
def raise_wrapper(major_ver):
def versioned_raise_wrapper(orig_method):
@functools.wraps(orig_method)