From 71babd968bd33a3f881fc8669e34717b395178d3 Mon Sep 17 00:00:00 2001 From: Daniel Garrett Date: Tue, 27 Jun 2023 15:00:36 -0500 Subject: [PATCH 1/2] Updated read_excel docstring to match style guide formatting --- pandas/io/excel/_base.py | 144 +++++++++++++++++++-------------------- 1 file changed, 72 insertions(+), 72 deletions(-) diff --git a/pandas/io/excel/_base.py b/pandas/io/excel/_base.py index f4782dcfcc08d..9c434e25d909a 100644 --- a/pandas/io/excel/_base.py +++ b/pandas/io/excel/_base.py @@ -79,7 +79,7 @@ ) _read_excel_doc = ( """ -Read an Excel file into a pandas DataFrame. +Read an Excel file into a ``pandas`` ``DataFrame``. Supports `xls`, `xlsx`, `xlsm`, `xlsb`, `odf`, `ods` and `odt` file extensions read from a local filesystem or URL. Supports an option to read @@ -101,29 +101,29 @@ Strings are used for sheet names. Integers are used in zero-indexed sheet positions (chart sheets do not count as a sheet position). Lists of strings/integers are used to request multiple sheets. - Specify None to get all worksheets. + Specify ``None`` to get all worksheets. Available cases: - * Defaults to ``0``: 1st sheet as a `DataFrame` - * ``1``: 2nd sheet as a `DataFrame` + * Defaults to ``0``: 1st sheet as a ``DataFrame`` + * ``1``: 2nd sheet as a ``DataFrame`` * ``"Sheet1"``: Load sheet with name "Sheet1" * ``[0, 1, "Sheet5"]``: Load first, second and sheet named "Sheet5" - as a dict of `DataFrame` - * None: All worksheets. + as a dict of ``DataFrame`` + * ``None``: All worksheets. header : int, list of int, default 0 Row (0-indexed) to use for the column labels of the parsed - DataFrame. If a list of integers is passed those row positions will - be combined into a ``MultiIndex``. Use None if there is no header. + ``DataFrame``. If a list of integers is passed those row positions will + be combined into a ``MultiIndex``. Use ``None`` if there is no header. names : array-like, default None List of column names to use. 
If file contains no header row, - then you should explicitly pass header=None. + then you should explicitly pass ``header=None``. index_col : int, str, list of int, default None - Column (0-indexed) to use as the row labels of the DataFrame. + Column (0-indexed) to use as the row labels of the ``DataFrame``. Pass None if there is no such column. If a list is passed, those columns will be combined into a ``MultiIndex``. If a - subset of data is selected with ``usecols``, index_col + subset of data is selected with ``usecols``, ``index_col`` is based on the subset. Missing values will be forward filled to allow roundtripping with @@ -131,31 +131,31 @@ missing values use ``set_index`` after reading the data instead of ``index_col``. usecols : str, list-like, or callable, default None - * If None, then parse all columns. - * If str, then indicates comma separated list of Excel column letters + * If ``None``, then parse all columns. + * If ``str``, then indicates comma separated list of Excel column letters and column ranges (e.g. "A:E" or "A,C,E:F"). Ranges are inclusive of both sides. - * If list of int, then indicates list of column numbers to be parsed + * If list of ``int``, then indicates list of column numbers to be parsed (0-indexed). - * If list of string, then indicates list of column names to be parsed. + * If list of ``str``, then indicates list of column names to be parsed. * If callable, then evaluate each column name against it and parse the column if the callable returns ``True``. Returns a subset of the columns according to behavior above. dtype : Type name or dict of column -> type, default None - Data type for data or columns. E.g. {{'a': np.float64, 'b': np.int32}} + Data type for data or columns. E.g. ``{'a': np.float64, 'b': np.int32}`` Use `object` to preserve data as stored in Excel and not interpret dtype. If converters are specified, they will be applied INSTEAD - of dtype conversion. + of ``dtype`` conversion. 
engine : str, default None - If io is not a buffer or path, this must be set to identify io. - Supported engines: "xlrd", "openpyxl", "odf", "pyxlsb". + If ``io`` is not a buffer or path, this must be set to identify ``io``. + Supported engines: ``"xlrd"``, ``"openpyxl"``, ``"odf"``, ``"pyxlsb"``. Engine compatibility : - - "xlrd" supports old-style Excel files (.xls). - - "openpyxl" supports newer Excel file formats. - - "odf" supports OpenDocument file formats (.odf, .ods, .odt). - - "pyxlsb" supports Binary Excel files. + - ``"xlrd"`` supports old-style Excel files (.xls). + - ``"openpyxl"`` supports newer Excel file formats. + - ``"odf"`` supports OpenDocument file formats (.odf, .ods, .odt). + - ``"pyxlsb"`` supports Binary Excel files. .. versionchanged:: 1.2.0 The engine `xlrd `_ @@ -181,70 +181,70 @@ input argument, the Excel cell content, and return the transformed content. true_values : list, default None - Values to consider as True. + Values to consider as ``True``. false_values : list, default None - Values to consider as False. + Values to consider as ``False``. skiprows : list-like, int, or callable, optional - Line numbers to skip (0-indexed) or number of lines to skip (int) at the + Line numbers to skip (0-indexed) or number of lines to skip (``int``) at the start of the file. If callable, the callable function will be evaluated - against the row indices, returning True if the row should be skipped and + against the row indices, returning ``True`` if the row should be skipped and False otherwise. An example of a valid callable argument would be ``lambda x: x in [0, 2]``. nrows : int, default None Number of rows to parse. na_values : scalar, str, list-like, or dict, default None - Additional strings to recognize as NA/NaN. If dict passed, specific + Additional strings to recognize as NA/NaN. If ``dict`` passed, specific per-column NA values. 
By default the following values are interpreted as NaN: '""" + fill("', '".join(sorted(STR_NA_VALUES)), 70, subsequent_indent=" ") + """'. keep_default_na : bool, default True - Whether or not to include the default NaN values when parsing the data. - Depending on whether `na_values` is passed in, the behavior is as follows: - - * If `keep_default_na` is True, and `na_values` are specified, `na_values` - is appended to the default NaN values used for parsing. - * If `keep_default_na` is True, and `na_values` are not specified, only - the default NaN values are used for parsing. - * If `keep_default_na` is False, and `na_values` are specified, only - the NaN values specified `na_values` are used for parsing. - * If `keep_default_na` is False, and `na_values` are not specified, no - strings will be parsed as NaN. - - Note that if `na_filter` is passed in as False, the `keep_default_na` and - `na_values` parameters will be ignored. + Whether or not to include the default ``NaN`` values when parsing the data. + Depending on whether ``na_values`` is passed in, the behavior is as follows: + + * If ``keep_default_na=True``, and ``na_values`` are specified, ``na_values`` + is appended to the default ``NaN`` values used for parsing. + * If ``keep_default_na=True``, and ``na_values`` are not specified, only + the default ``NaN`` values are used for parsing. + * If ``keep_default_na=False``, and ``na_values`` are specified, only + the ``NaN`` values specified ``na_values`` are used for parsing. + * If ``keep_default_na=False``, and ``na_values`` are not specified, no + strings will be parsed as ``NaN``. + + Note that if ``na_filter=False``, the ``keep_default_na`` and + ``na_values`` parameters will be ignored. na_filter : bool, default True - Detect missing value markers (empty strings and the value of na_values). In - data without any NAs, passing na_filter=False can improve the performance + Detect missing value markers (empty strings and the value of ``na_values``). 
In + data without any NAs, ``passing na_filter=False`` can improve the performance of reading a large file. verbose : bool, default False Indicate number of NA values placed in non-numeric columns. parse_dates : bool, list-like, or dict, default False The behavior is as follows: - * bool. If True -> try parsing the index. - * list of int or names. e.g. If [1, 2, 3] -> try parsing columns 1, 2, 3 + * ``bool``. If True -> try parsing the index. + * ``list`` of ``int`` or names. e.g. If ``[1, 2, 3]`` -> try parsing columns 1, 2, 3 each as a separate date column. - * list of lists. e.g. If [[1, 3]] -> combine columns 1 and 3 and parse as + * ``list`` of lists. e.g. If ``[[1, 3]]`` -> combine columns 1 and 3 and parse as a single date column. - * dict, e.g. {{'foo' : [1, 3]}} -> parse columns 1, 3 as date and call - result 'foo' + * ``dict``, e.g. ``{'foo' : [1, 3]}`` -> parse columns 1, 3 as date and call + result ``'foo'`` If a column or index contains an unparsable date, the entire column or index will be returned unaltered as an object data type. If you don`t want to - parse some cells as date just change their type in Excel to "Text". - For non-standard datetime parsing, use ``pd.to_datetime`` after ``pd.read_excel``. + parse some cells as date, just change their type in Excel to "Text". + For non-standard ``datetime`` parsing, use ``pd.to_datetime`` after ``pd.read_excel``. Note: A fast-path exists for iso8601-formatted dates. date_parser : function, optional Function to use for converting a sequence of string columns to an array of - datetime instances. The default uses ``dateutil.parser.parser`` to do the - conversion. Pandas will try to call `date_parser` in three different ways, + ``datetime`` instances. The default uses ``dateutil.parser.parser`` to do the + conversion. 
Pandas will try to call ``date_parser`` in three different ways, advancing to the next if an exception occurs: 1) Pass one or more arrays - (as defined by `parse_dates`) as arguments; 2) concatenate (row-wise) the - string values from the columns defined by `parse_dates` into a single array - and pass that; and 3) call `date_parser` once for each row using one or - more strings (corresponding to the columns defined by `parse_dates`) as + (as defined by ``parse_dates``) as arguments; 2) concatenate (row-wise) the + string values from the columns defined by ``parse_dates`` into a single array + and pass that; and 3) call ``date_parser`` once for each row using one or + more strings (corresponding to the columns defined by ``parse_dates``) as arguments. .. deprecated:: 2.0.0 @@ -279,13 +279,13 @@ .. versionadded:: 1.2.0 -dtype_backend : {{"numpy_nullable", "pyarrow"}}, defaults to NumPy backed DataFrames - Which dtype_backend to use, e.g. whether a DataFrame should have NumPy - arrays, nullable dtypes are used for all dtypes that have a nullable - implementation when "numpy_nullable" is set, pyarrow is used for all - dtypes if "pyarrow" is set. +dtype_backend : {{"numpy_nullable", "pyarrow"}}, defaults to ``numpy`` backed ``DataFrames`` + Which ``dtype_backend`` to use, e.g. whether a ``DataFrame`` should have ``numpy`` + arrays, nullable ``dtypes`` are used for all ``dtypes`` that have a nullable + implementation when ``"numpy_nullable"`` is set, ``pyarrow`` is used for all + dtypes if ``"pyarrow"`` is set. - The dtype_backends are still experimential. + The ``dtype_backends`` are still experimental. .. versionadded:: 2.0 @@ -295,15 +295,15 @@ Returns ------- DataFrame or dict of DataFrames - ``DataFrame`` from the passed in Excel file. 
See notes in ``sheet_name`` + argument for more information on when a ``dict`` of ``DataFrames`` is returned. See Also -------- -DataFrame.to_excel : Write DataFrame to an Excel file. -DataFrame.to_csv : Write DataFrame to a comma-separated values (csv) file. -read_csv : Read a comma-separated values (csv) file into DataFrame. -read_fwf : Read a table of fixed-width formatted lines into DataFrame. +DataFrame.to_excel : Write ``DataFrame`` to an Excel file. +DataFrame.to_csv : Write ``DataFrame`` to a comma-separated values (csv) file. +read_csv : Read a comma-separated values (csv) file into ``DataFrame``. +read_fwf : Read a table of fixed-width formatted lines into ``DataFrame``. Notes ----- @@ -327,7 +327,7 @@ 1 1 string2 2 2 2 #Comment 3 -Index and header can be specified via the `index_col` and `header` arguments +Index and header can be specified via the ``index_col`` and ``header`` arguments >>> pd.read_excel('tmp.xlsx', index_col=None, header=None) # doctest: +SKIP 0 1 2 @@ -345,7 +345,7 @@ 1 string2 2.0 2 #Comment 3.0 -True, False, and NA values, and thousands separators have defaults, +``True``, ``False``, ``NaN`` values, and thousands of separators have defaults, but can be explicitly specified, too. Supply the values you would like as strings or lists of strings! 
@@ -356,7 +356,7 @@ 1 NaN 2 2 #Comment 3 -Comment lines in the excel input file can be skipped using the `comment` kwarg +Comment lines in the excel input file can be skipped using the ``comment`` ``kwarg`` >>> pd.read_excel('tmp.xlsx', index_col=0, comment='#') # doctest: +SKIP Name Value From 9e46790ae2e60f2d472c6ba3512f278edc04914b Mon Sep 17 00:00:00 2001 From: Daniel Garrett Date: Mon, 3 Jul 2023 15:10:25 -0500 Subject: [PATCH 2/2] Changes referenced in PR review comments from @rhshadrach --- pandas/io/excel/_base.py | 44 ++++++++++++++++++++-------------------- 1 file changed, 22 insertions(+), 22 deletions(-) diff --git a/pandas/io/excel/_base.py b/pandas/io/excel/_base.py index 9c434e25d909a..10f2422a9df1c 100644 --- a/pandas/io/excel/_base.py +++ b/pandas/io/excel/_base.py @@ -79,7 +79,7 @@ ) _read_excel_doc = ( """ -Read an Excel file into a ``pandas`` ``DataFrame``. +Read an Excel file into a ``pandas`` DataFrame. Supports `xls`, `xlsx`, `xlsm`, `xlsb`, `odf`, `ods` and `odt` file extensions read from a local filesystem or URL. Supports an option to read @@ -105,22 +105,22 @@ Available cases: - * Defaults to ``0``: 1st sheet as a ``DataFrame`` - * ``1``: 2nd sheet as a ``DataFrame`` + * Defaults to ``0``: 1st sheet as a DataFrame + * ``1``: 2nd sheet as a DataFrame * ``"Sheet1"``: Load sheet with name "Sheet1" * ``[0, 1, "Sheet5"]``: Load first, second and sheet named "Sheet5" - as a dict of ``DataFrame`` + as a dict of DataFrame * ``None``: All worksheets. header : int, list of int, default 0 Row (0-indexed) to use for the column labels of the parsed - ``DataFrame``. If a list of integers is passed those row positions will + DataFrame. If a list of integers is passed those row positions will be combined into a ``MultiIndex``. Use ``None`` if there is no header. names : array-like, default None List of column names to use. If file contains no header row, then you should explicitly pass ``header=None``. 
index_col : int, str, list of int, default None - Column (0-indexed) to use as the row labels of the ``DataFrame``. + Column (0-indexed) to use as the row labels of the DataFrame. Pass None if there is no such column. If a list is passed, those columns will be combined into a ``MultiIndex``. If a subset of data is selected with ``usecols``, ``index_col`` @@ -143,7 +143,7 @@ Returns a subset of the columns according to behavior above. dtype : Type name or dict of column -> type, default None - Data type for data or columns. E.g. ``{'a': np.float64, 'b': np.int32}`` + Data type for data or columns. E.g. ``{{'a': np.float64, 'b': np.int32}}`` Use `object` to preserve data as stored in Excel and not interpret dtype. If converters are specified, they will be applied INSTEAD of ``dtype`` conversion. @@ -152,10 +152,10 @@ Supported engines: ``"xlrd"``, ``"openpyxl"``, ``"odf"``, ``"pyxlsb"``. Engine compatibility : - - ``"xlrd"`` supports old-style Excel files (.xls). - - ``"openpyxl"`` supports newer Excel file formats. - - ``"odf"`` supports OpenDocument file formats (.odf, .ods, .odt). - - ``"pyxlsb"`` supports Binary Excel files. + - ``xlrd`` supports old-style Excel files (.xls). + - ``openpyxl`` supports newer Excel file formats. + - ``odf`` supports OpenDocument file formats (.odf, .ods, .odt). + - ``pyxlsb`` supports Binary Excel files. .. versionchanged:: 1.2.0 The engine `xlrd `_ @@ -215,7 +215,7 @@ ``na_values`` parameters will be ignored. na_filter : bool, default True Detect missing value markers (empty strings and the value of ``na_values``). In - data without any NAs, ``passing na_filter=False`` can improve the performance + data without any NAs, passing ``na_filter=False`` can improve the performance of reading a large file. verbose : bool, default False Indicate number of NA values placed in non-numeric columns. 
@@ -233,7 +233,7 @@ If a column or index contains an unparsable date, the entire column or index will be returned unaltered as an object data type. If you don`t want to parse some cells as date, just change their type in Excel to "Text". - For non-standard ``datetime`` parsing, use ``pd.to_datetime`` after ``pd.read_excel``. + For non-standard datetime parsing, use ``pd.to_datetime`` after ``pd.read_excel``. Note: A fast-path exists for iso8601-formatted dates. date_parser : function, optional @@ -279,8 +279,8 @@ .. versionadded:: 1.2.0 -dtype_backend : {{"numpy_nullable", "pyarrow"}}, defaults to ``numpy`` backed ``DataFrames`` - Which ``dtype_backend`` to use, e.g. whether a ``DataFrame`` should have ``numpy`` +dtype_backend : {{"numpy_nullable", "pyarrow"}}, defaults to NumPy backed ``DataFrames`` + Which ``dtype_backend`` to use, e.g. whether a DataFrame should have NumPy arrays, nullable ``dtypes`` are used for all ``dtypes`` that have a nullable implementation when ``"numpy_nullable"`` is set, ``pyarrow`` is used for all dtypes if ``"pyarrow"`` is set. @@ -295,15 +295,15 @@ Returns ------- DataFrame or dict of DataFrames - ``DataFrame`` from the passed in Excel file. See notes in ``sheet_name`` + DataFrame from the passed in Excel file. See notes in ``sheet_name`` argument for more information on when a ``dict`` of ``DataFrames`` is returned. See Also -------- -DataFrame.to_excel : Write ``DataFrame`` to an Excel file. -DataFrame.to_csv : Write ``DataFrame`` to a comma-separated values (csv) file. -read_csv : Read a comma-separated values (csv) file into ``DataFrame``. -read_fwf : Read a table of fixed-width formatted lines into ``DataFrame``. +DataFrame.to_excel : Write DataFrame to an Excel file. +DataFrame.to_csv : Write DataFrame to a comma-separated values (csv) file. +read_csv : Read a comma-separated values (csv) file into DataFrame. +read_fwf : Read a table of fixed-width formatted lines into DataFrame. 
Notes ----- @@ -345,7 +345,7 @@ 1 string2 2.0 2 #Comment 3.0 -``True``, ``False``, ``NaN`` values, and thousands of separators have defaults, +True, False, and NA values, and thousands separators have defaults, but can be explicitly specified, too. Supply the values you would like as strings or lists of strings! @@ -356,7 +356,7 @@ 1 NaN 2 2 #Comment 3 -Comment lines in the excel input file can be skipped using the ``comment`` ``kwarg`` +Comment lines in the excel input file can be skipped using the ``comment`` keyword argument >>> pd.read_excel('tmp.xlsx', index_col=0, comment='#') # doctest: +SKIP Name Value