diff --git a/doc/source/io.rst b/doc/source/io.rst index f8fe6fc8a4c3a..ff031ccc88ddf 100644 --- a/doc/source/io.rst +++ b/doc/source/io.rst @@ -1949,56 +1949,106 @@ module and use the same parsing code as the above to convert tabular data into a DataFrame. See the :ref:`cookbook` for some advanced strategies -Besides ``read_excel`` you can also read Excel files using the ``ExcelFile`` -class. The following two commands are equivalent: +Reading Excel Files +~~~~~~~~~~~~~~~~~~~ + +.. versionadded:: 0.16 + +``read_excel`` can read more than one sheet, by setting ``sheetname`` to either +a list of sheet names, a list of sheet positions, or ``None`` to read all sheets. + +.. versionadded:: 0.13 + +Sheets can be specified by sheet index or sheet name, using an integer or string, +respectively. + +.. versionadded:: 0.12 + +``ExcelFile`` has been moved to the top level namespace. + +There are two approaches to reading an Excel file: the ``read_excel`` function +and the ``ExcelFile`` class. ``read_excel`` is for reading one file +with file-specific arguments (i.e. identical data formats across sheets). +``ExcelFile`` is for reading one file with sheet-specific arguments (i.e. various data +formats across sheets). Choosing the approach is largely a question of +code readability and execution speed. + +Equivalent class and function approaches to read a single sheet: .. code-block:: python # using the ExcelFile class xls = pd.ExcelFile('path_to_file.xls') - xls.parse('Sheet1', index_col=None, na_values=['NA']) + data = xls.parse('Sheet1', index_col=None, na_values=['NA']) # using the read_excel function - read_excel('path_to_file.xls', 'Sheet1', index_col=None, na_values=['NA']) + data = read_excel('path_to_file.xls', 'Sheet1', index_col=None, na_values=['NA']) -The class based approach can be used to read multiple sheets or to introspect -the sheet names using the ``sheet_names`` attribute. +Equivalent class and function approaches to read multiple sheets: -.. note:: +..
code-block:: python - The prior method of accessing ``ExcelFile`` has been moved from - ``pandas.io.parsers`` to the top level namespace starting from pandas - 0.12.0. + data = {} + # For when Sheet1's format differs from Sheet2 + xls = pd.ExcelFile('path_to_file.xls') + data['Sheet1'] = xls.parse('Sheet1', index_col=None, na_values=['NA']) + data['Sheet2'] = xls.parse('Sheet2', index_col=1) + + # For when Sheet1's format is identical to Sheet2 + data = read_excel('path_to_file.xls', ['Sheet1','Sheet2'], index_col=None, na_values=['NA']) + +Specifying Sheets ++++++++++++++++++ +.. _io.specifying_sheets: -.. versionadded:: 0.13 +.. note :: The second argument is ``sheetname``, not to be confused with ``ExcelFile.sheet_names``. -There are now two ways to read in sheets from an Excel file. You can provide -either the index of a sheet or its name to by passing different values for -``sheet_name``. +.. note :: An ExcelFile's attribute ``sheet_names`` provides access to a list of sheets. +- The argument ``sheetname`` allows specifying the sheet or sheets to read. +- The default value for ``sheetname`` is 0, which reads the first sheet. - Pass a string to refer to the name of a particular sheet in the workbook. - Pass an integer to refer to the index of a sheet. Indices follow Python convention, beginning at 0. -- The default value is ``sheet_name=0``. This reads the first sheet. - -Using the sheet name: +- Pass a list of either strings or integers, to return a dictionary of specified sheets. +- Pass a ``None`` to return a dictionary of all available sheets. .. code-block:: python + # Returns a DataFrame read_excel('path_to_file.xls', 'Sheet1', index_col=None, na_values=['NA']) Using the sheet index: .. code-block:: python - read_excel('path_to_file.xls', 0, index_col=None, na_values=['NA']) + # Returns a DataFrame + read_excel('path_to_file.xls', 0, index_col=None, na_values=['NA']) Using all default values: ..
code-block:: python + # Returns a DataFrame read_excel('path_to_file.xls') +Using None to get all sheets: + +.. code-block:: python + + # Returns a dictionary of DataFrames + read_excel('path_to_file.xls',sheetname=None) + +Using a list to get multiple sheets: + +.. code-block:: python + + # Returns the 1st and 4th sheet, as a dictionary of DataFrames. + read_excel('path_to_file.xls',sheetname=['Sheet1',3]) + +Parsing Specific Columns +++++++++++++++++++++++++ + It is often the case that users will insert columns to do temporary computations in Excel and you may not want to read in those columns. `read_excel` takes a `parse_cols` keyword to allow you to specify a subset of columns to parse. @@ -2017,26 +2067,30 @@ indices to be parsed. read_excel('path_to_file.xls', 'Sheet1', parse_cols=[0, 2, 3]) -.. note:: +Cell Converters ++++++++++++++++ - It is possible to transform the contents of Excel cells via the `converters` - option. For instance, to convert a column to boolean: +It is possible to transform the contents of Excel cells via the ``converters`` +option. For instance, to convert a column to boolean: - .. code-block:: python +.. code-block:: python - read_excel('path_to_file.xls', 'Sheet1', converters={'MyBools': bool}) + read_excel('path_to_file.xls', 'Sheet1', converters={'MyBools': bool}) - This options handles missing values and treats exceptions in the converters - as missing data. Transformations are applied cell by cell rather than to the - column as a whole, so the array dtype is not guaranteed. For instance, a - column of integers with missing values cannot be transformed to an array - with integer dtype, because NaN is strictly a float. You can manually mask - missing data to recover integer dtype: +This option handles missing values and treats exceptions in the converters +as missing data. Transformations are applied cell by cell rather than to the +column as a whole, so the array dtype is not guaranteed.
For instance, a +column of integers with missing values cannot be transformed to an array +with integer dtype, because NaN is strictly a float. You can manually mask +missing data to recover integer dtype: - .. code-block:: python +.. code-block:: python - cfun = lambda x: int(x) if x else -1 - read_excel('path_to_file.xls', 'Sheet1', converters={'MyInts': cfun}) + cfun = lambda x: int(x) if x else -1 + read_excel('path_to_file.xls', 'Sheet1', converters={'MyInts': cfun}) + +Writing Excel Files +~~~~~~~~~~~~~~~~~~~ To write a DataFrame object to a sheet of an Excel file, you can use the ``to_excel`` instance method. The arguments are largely the same as ``to_csv`` diff --git a/doc/source/whatsnew/v0.16.0.txt b/doc/source/whatsnew/v0.16.0.txt index c12513e087619..bee77d35674f3 100644 --- a/doc/source/whatsnew/v0.16.0.txt +++ b/doc/source/whatsnew/v0.16.0.txt @@ -190,6 +190,14 @@ Enhancements - Added ``StringMethods.find()`` and ``rfind()`` which behave as the same as standard ``str`` (:issue:`9386`) - Added ``StringMethods.isnumeric`` and ``isdecimal`` which behave as the same as standard ``str`` (:issue:`9439`) +- The ``read_excel()`` function's :ref:`sheetname <io.specifying_sheets>` argument now accepts a list and ``None``, to get multiple or all sheets respectively. If more than one sheet is specified, a dictionary is returned. (:issue:`9450`) + +.. code-block:: python + + # Returns the 1st and 4th sheet, as a dictionary of DataFrames. + pd.read_excel('path_to_file.xls',sheetname=['Sheet1',3]) + +- A ``verbose`` argument has been added to ``io.read_excel()``; it defaults to ``False``. Set to ``True`` to print sheet names as they are parsed.
(:issue:`9450`) - Added ``StringMethods.ljust()`` and ``rjust()`` which behave as the same as standard ``str`` (:issue:`9352`) - ``StringMethods.pad()`` and ``center()`` now accept ``fillchar`` option to specify filling character (:issue:`9352`) - Added ``StringMethods.zfill()`` which behave as the same as standard ``str`` (:issue:`9387`) diff --git a/pandas/io/excel.py b/pandas/io/excel.py index acec411a2e546..cab342dc339f4 100644 --- a/pandas/io/excel.py +++ b/pandas/io/excel.py @@ -75,8 +75,26 @@ def read_excel(io, sheetname=0, **kwds): The string could be a URL. Valid URL schemes include http, ftp, s3, and file. For file URLs, a host is expected. For instance, a local file could be file://localhost/path/to/workbook.xlsx - sheetname : string or int, default 0 - Name of Excel sheet or the page number of the sheet + sheetname : string, int, mixed list of strings/ints, or None, default 0 + + Strings are used for sheet names, integers are used for zero-indexed sheet + positions. + + Lists of strings/integers are used to request multiple sheets. + + Specify None to get all sheets. + + str|int -> DataFrame is returned. + list|None -> Dict of DataFrames is returned, with keys representing sheets. + + Available Cases + + * Defaults to 0 -> 1st sheet as a DataFrame + * 1 -> 2nd sheet as a DataFrame + * "Sheet1" -> sheet named "Sheet1" as a DataFrame + * [0,1,"Sheet5"] -> 1st, 2nd & the sheet named "Sheet5" as a dictionary of DataFrames + * None -> All sheets as a dictionary of DataFrames + header : int, default 0 Row to use for the column labels of the parsed DataFrame skiprows : list-like @@ -118,8 +136,9 @@ def read_excel(io, sheetname=0, **kwds): Returns ------- - parsed : DataFrame - DataFrame from the passed in Excel file + parsed : DataFrame or Dict of DataFrames + DataFrame from the passed in Excel file. See notes in sheetname argument + for more information on when a Dict of DataFrames is returned.
""" if 'kind' in kwds: @@ -185,8 +204,25 @@ def parse(self, sheetname=0, header=0, skiprows=None, skip_footer=0, Parameters ---------- - sheetname : string or integer - Name of Excel sheet or the page number of the sheet + sheetname : string, int, mixed list of strings/ints, or None, default 0 + + Strings are used for sheet names, Integers are used in zero-indexed sheet + positions. + + Lists of strings/integers are used to request multiple sheets. + + Specify None to get all sheets. + + str|int -> DataFrame is returned. + list|None -> Dict of DataFrames is returned, with keys representing sheets. + + Available Cases + + * Defaults to 0 -> 1st sheet as a DataFrame + * 1 -> 2nd sheet as a DataFrame + * "Sheet1" -> 1st sheet as a DataFrame + * [0,1,"Sheet5"] -> 1st, 2nd & 5th sheet as a dictionary of DataFrames + * None -> All sheets as a dictionary of DataFrames header : int, default 0 Row to use for the column labels of the parsed DataFrame skiprows : list-like @@ -223,11 +259,15 @@ def parse(self, sheetname=0, header=0, skiprows=None, skip_footer=0, has_index_names : boolean, default False True if the cols defined in index_col have an index name and are not in the header + verbose : boolean, default False + Set to True to print a single statement when reading each + excel sheet. Returns ------- - parsed : DataFrame - DataFrame parsed from the Excel file + parsed : DataFrame or Dict of DataFrames + DataFrame from the passed in Excel file. See notes in sheetname argument + for more information on when a Dict of Dataframes is returned. 
""" skipfooter = kwds.pop('skipfooter', None) if skipfooter is not None: @@ -283,7 +323,7 @@ def _parse_excel(self, sheetname=0, header=0, skiprows=None, skip_footer=0, index_col=None, has_index_names=None, parse_cols=None, parse_dates=False, date_parser=None, na_values=None, thousands=None, chunksize=None, convert_float=True, - **kwds): + verbose=False, **kwds): import xlrd from xlrd import (xldate, XL_CELL_DATE, XL_CELL_ERROR, XL_CELL_BOOLEAN, @@ -291,81 +331,114 @@ def _parse_excel(self, sheetname=0, header=0, skiprows=None, skip_footer=0, epoch1904 = self.book.datemode + def _parse_cell(cell_contents,cell_typ): + """converts the contents of the cell into a pandas + appropriate object""" + + if cell_typ == XL_CELL_DATE: + if xlrd_0_9_3: + # Use the newer xlrd datetime handling. + cell_contents = xldate.xldate_as_datetime(cell_contents, + epoch1904) + + # Excel doesn't distinguish between dates and time, + # so we treat dates on the epoch as times only. + # Also, Excel supports 1900 and 1904 epochs. + year = (cell_contents.timetuple())[0:3] + if ((not epoch1904 and year == (1899, 12, 31)) + or (epoch1904 and year == (1904, 1, 1))): + cell_contents = datetime.time(cell_contents.hour, + cell_contents.minute, + cell_contents.second, + cell_contents.microsecond) + else: + # Use the xlrd <= 0.9.2 date handling. + dt = xldate.xldate_as_tuple(cell_contents, epoch1904) + + if dt[0] < datetime.MINYEAR: + cell_contents = datetime.time(*dt[3:]) + else: + cell_contents = datetime.datetime(*dt) + + elif cell_typ == XL_CELL_ERROR: + cell_contents = np.nan + elif cell_typ == XL_CELL_BOOLEAN: + cell_contents = bool(cell_contents) + elif convert_float and cell_typ == XL_CELL_NUMBER: + # GH5394 - Excel 'numbers' are always floats + # it's a minimal perf hit and less suprising + val = int(cell_contents) + if val == cell_contents: + cell_contents = val + return cell_contents + # xlrd >= 0.9.3 can return datetime objects directly. 
if LooseVersion(xlrd.__VERSION__) >= LooseVersion("0.9.3"): xlrd_0_9_3 = True else: xlrd_0_9_3 = False - - if isinstance(sheetname, compat.string_types): - sheet = self.book.sheet_by_name(sheetname) - else: # assume an integer if not a string - sheet = self.book.sheet_by_index(sheetname) - - data = [] - should_parse = {} - for i in range(sheet.nrows): - row = [] - for j, (value, typ) in enumerate(zip(sheet.row_values(i), - sheet.row_types(i))): - if parse_cols is not None and j not in should_parse: - should_parse[j] = self._should_parse(j, parse_cols) - - if parse_cols is None or should_parse[j]: - if typ == XL_CELL_DATE: - if xlrd_0_9_3: - # Use the newer xlrd datetime handling. - value = xldate.xldate_as_datetime(value, epoch1904) - - # Excel doesn't distinguish between dates and time, - # so we treat dates on the epoch as times only. - # Also, Excel supports 1900 and 1904 epochs. - year = (value.timetuple())[0:3] - if ((not epoch1904 and year == (1899, 12, 31)) - or (epoch1904 and year == (1904, 1, 1))): - value = datetime.time(value.hour, - value.minute, - value.second, - value.microsecond) - else: - # Use the xlrd <= 0.9.2 date handling. 
- dt = xldate.xldate_as_tuple(value, epoch1904) - - if dt[0] < datetime.MINYEAR: - value = datetime.time(*dt[3:]) - else: - value = datetime.datetime(*dt) - - elif typ == XL_CELL_ERROR: - value = np.nan - elif typ == XL_CELL_BOOLEAN: - value = bool(value) - elif convert_float and typ == XL_CELL_NUMBER: - # GH5394 - Excel 'numbers' are always floats - # it's a minimal perf hit and less suprising - val = int(value) - if val == value: - value = val - - row.append(value) - - data.append(row) - - if header is not None: - data[header] = _trim_excel_header(data[header]) - - parser = TextParser(data, header=header, index_col=index_col, - has_index_names=has_index_names, - na_values=na_values, - thousands=thousands, - parse_dates=parse_dates, - date_parser=date_parser, - skiprows=skiprows, - skip_footer=skip_footer, - chunksize=chunksize, - **kwds) - - return parser.read() + + ret_dict = False + + #Keep sheetname to maintain backwards compatibility. + if isinstance(sheetname, list): + sheets = sheetname + ret_dict = True + elif sheetname is None: + sheets = self.sheet_names + ret_dict = True + else: + sheets = [sheetname] + + #handle same-type duplicates. 
+ sheets = list(set(sheets)) + + output = {} + + for asheetname in sheets: + if verbose: + print("Reading sheet %s" % asheetname) + + if isinstance(asheetname, compat.string_types): + sheet = self.book.sheet_by_name(asheetname) + else: # assume an integer if not a string + sheet = self.book.sheet_by_index(asheetname) + + data = [] + should_parse = {} + + for i in range(sheet.nrows): + row = [] + for j, (value, typ) in enumerate(zip(sheet.row_values(i), + sheet.row_types(i))): + if parse_cols is not None and j not in should_parse: + should_parse[j] = self._should_parse(j, parse_cols) + + if parse_cols is None or should_parse[j]: + row.append(_parse_cell(value,typ)) + data.append(row) + + if header is not None: + data[header] = _trim_excel_header(data[header]) + + parser = TextParser(data, header=header, index_col=index_col, + has_index_names=has_index_names, + na_values=na_values, + thousands=thousands, + parse_dates=parse_dates, + date_parser=date_parser, + skiprows=skiprows, + skip_footer=skip_footer, + chunksize=chunksize, + **kwds) + + output[asheetname] = parser.read() + + if ret_dict: + return output + else: + return output[asheetname] + @property def sheet_names(self): diff --git a/pandas/io/tests/data/test_multisheet.xlsx b/pandas/io/tests/data/test_multisheet.xlsx new file mode 100644 index 0000000000000..5de07772b276a Binary files /dev/null and b/pandas/io/tests/data/test_multisheet.xlsx differ diff --git a/pandas/io/tests/test_excel.py b/pandas/io/tests/test_excel.py index 5909f8af0e5dd..95f072835f2b6 100644 --- a/pandas/io/tests/test_excel.py +++ b/pandas/io/tests/test_excel.py @@ -80,6 +80,7 @@ def setUp(self): self.csv2 = os.path.join(self.dirpath, 'test2.csv') self.xls1 = os.path.join(self.dirpath, 'test.xls') self.xlsx1 = os.path.join(self.dirpath, 'test.xlsx') + self.multisheet = os.path.join(self.dirpath, 'test_multisheet.xlsx') self.frame = _frame.copy() self.frame2 = _frame2.copy() self.tsframe = _tsframe.copy() @@ -423,7 +424,59 @@ def 
def test_reading_all_sheets(self):
    # Test reading all sheetnames by setting sheetname to None,
    # Ensure a dict is returned.
    # See PR #9450

    _skip_if_no_xlrd()

    dfs = read_excel(self.multisheet, sheetname=None)
    expected_keys = ['Alpha', 'Beta', 'Charlie']
    tm.assert_contains_all(expected_keys, dfs.keys())

def test_reading_multiple_specific_sheets(self):
    # Test reading specific sheetnames by specifying a mixed list
    # of integers and strings, and confirm that duplicated sheet
    # references (positions/names) are removed properly.

    # Ensure a dict is returned
    # See PR #9450
    _skip_if_no_xlrd()

    # Explicitly request duplicates. Only the set should be returned.
    expected_keys = [2, 'Charlie', 'Charlie']
    dfs = read_excel(self.multisheet, sheetname=expected_keys)
    expected_keys = list(set(expected_keys))
    tm.assert_contains_all(expected_keys, dfs.keys())
    assert len(expected_keys) == len(dfs.keys())

def test_creating_and_reading_multiple_sheets(self):
    # Test reading multiple sheets, from a runtime created excel file
    # with multiple sheets.
    # See PR #9450

    _skip_if_no_xlrd()
    # NOTE(review): the file written below is '.xlsx'; confirm that the
    # xlwt check is the right skip guard for that format.
    _skip_if_no_xlwt()

    def tdf(sheetname):
        d, i = [11, 22, 33], [1, 2, 3]
        return DataFrame(d, i, columns=[sheetname])

    sheets = ['AAA', 'BBB', 'CCC']

    dfs = [tdf(s) for s in sheets]
    dfs = dict(zip(sheets, dfs))

    with ensure_clean('.xlsx') as pth:
        with ExcelWriter(pth) as ew:
            # dict.items() exists on both Python 2 and 3, whereas
            # dict.iteritems() is Python 2 only.
            for sheetname, df in dfs.items():
                df.to_excel(ew, sheetname)
        # Use the same module-level read_excel as the sibling tests.
        dfs_returned = read_excel(pth, sheetname=sheets)
        for s in sheets:
            tm.assert_frame_equal(dfs[s], dfs_returned[s])