diff --git a/doc/source/user_guide/io.rst b/doc/source/user_guide/io.rst index 53bcf6ffd7a8a..e39093fd35647 100644 --- a/doc/source/user_guide/io.rst +++ b/doc/source/user_guide/io.rst @@ -1366,8 +1366,10 @@ a different usage of the ``delimiter`` parameter: * ``widths``: A list of field widths which can be used instead of 'colspecs' if the intervals are contiguous. * ``delimiter``: Characters to consider as filler characters in the fixed-width file. - Can be used to specify the filler character of the fields - if it is not spaces (e.g., '~'). + Defaults to space and tab characters. + Used to specify the character(s) to strip from start and end of every field. + To preserve whitespace, set to a character that does not exist in the data, + e.g. ``"\0"``. Consider a typical fixed-width data file: @@ -1404,8 +1406,9 @@ column widths for contiguous columns: df = pd.read_fwf("bar.csv", widths=widths, header=None) df -The parser will take care of extra white spaces around the columns +The parser will take care of extra whitespace around the columns, so it's ok to have extra separation between the columns in the file. +To preserve whitespace around the columns, see ``delimiter``. By default, ``read_fwf`` will try to infer the file's ``colspecs`` by using the first 100 rows of the file. 
It can do it only in cases when the columns are diff --git a/pandas/io/parsers/readers.py b/pandas/io/parsers/readers.py index 575390e9b97a4..defe396813d5a 100644 --- a/pandas/io/parsers/readers.py +++ b/pandas/io/parsers/readers.py @@ -440,7 +440,12 @@ "float_precision": None, } -_fwf_defaults = {"colspecs": "infer", "infer_nrows": 100, "widths": None} +_fwf_defaults = { + "colspecs": "infer", + "infer_nrows": 100, + "widths": None, + "delimiter": " ", # space & [TAB] +} _c_unsupported = {"skipfooter"} _python_unsupported = {"low_memory", "float_precision"} @@ -1236,6 +1241,7 @@ def read_fwf( *, colspecs: Sequence[tuple[int, int]] | str | None = "infer", widths: Sequence[int] | None = None, + delimiter: str | None = " \t", infer_nrows: int = 100, **kwds, ) -> DataFrame | TextFileReader: @@ -1256,7 +1262,7 @@ def read_fwf( Valid URL schemes include http, ftp, s3, and file. For file URLs, a host is expected. A local file could be: ``file://localhost/path/to/table.csv``. - colspecs : list of tuple (int, int) or 'infer'. optional + colspecs : list of tuple (int, int) or 'infer', optional A list of tuples giving the extents of the fixed-width fields of each line as half-open intervals (i.e., [from, to[ ). String value 'infer' can be used to instruct the parser to try @@ -1265,6 +1271,10 @@ def read_fwf( widths : list of int, optional A list of field widths which can be used instead of 'colspecs' if the intervals are contiguous. + delimiter : str, default " \t" (space and tab), optional + Character(s) to strip from start and end of each field. To + preserve whitespace, set to a non-default value (e.g. delimiter="\0"). + Used by `colspecs="infer"` to determine column boundaries. infer_nrows : int, default 100 The number of rows to consider when letting the parser determine the `colspecs`.