diff --git a/doc/source/io.rst b/doc/source/io.rst index 25925ef4a8b91..f0556f5af8534 100644 --- a/doc/source/io.rst +++ b/doc/source/io.rst @@ -101,8 +101,9 @@ delim_whitespace : boolean, default False Specifies whether or not whitespace (e.g. ``' '`` or ``'\t'``) will be used as the delimiter. Equivalent to setting ``sep='\+s'``. If this option is set to True, nothing should be passed in for the - ``delimiter`` parameter. This parameter is currently supported for - the C parser only. + ``delimiter`` parameter. + + .. versionadded:: 0.18.1 support for the Python parser. Column and Index Locations and Names ++++++++++++++++++++++++++++++++++++ diff --git a/doc/source/whatsnew/v0.18.1.txt b/doc/source/whatsnew/v0.18.1.txt index 8d45be77ecb65..a121de9869c98 100644 --- a/doc/source/whatsnew/v0.18.1.txt +++ b/doc/source/whatsnew/v0.18.1.txt @@ -74,6 +74,7 @@ Partial string indexing now matches on ``DateTimeIndex`` when part of a ``MultiI Other Enhancements ^^^^^^^^^^^^^^^^^^ +- ``pd.read_csv()`` now supports ``delim_whitespace=True`` for the Python engine (:issue:`12958`) - ``pd.read_csv()`` now supports opening ZIP files that contains a single CSV, via extension inference or explict ``compression='zip'`` (:issue:`12175`) - ``pd.read_csv()`` now supports opening files using xz compression, via extension inference or explicit ``compression='xz'`` is specified; ``xz`` compressions is also supported by ``DataFrame.to_csv`` in the same way (:issue:`11852`) - ``pd.read_msgpack()`` now always gives writeable ndarrays even when compression is used (:issue:`12359`). diff --git a/pandas/io/parsers.py b/pandas/io/parsers.py index 4ee90599da23f..d90569e1aebb0 100755 --- a/pandas/io/parsers.py +++ b/pandas/io/parsers.py @@ -57,7 +57,10 @@ Specifies whether or not whitespace (e.g. ``' '`` or ``'\t'``) will be used as the sep. Equivalent to setting ``sep='\+s'``. If this option is set to True, nothing should be passed in for the ``delimiter`` - parameter. This parameter is currently supported for the C parser only. + parameter. + + .. versionadded:: 0.18.1 support for the Python parser. + header : int or list of ints, default 'infer' Row number(s) to use as the column names, and the start of the data. Default behavior is as if set to 0 if no ``names`` passed, otherwise @@ -390,7 +393,20 @@ def _read(filepath_or_buffer, kwds): } _c_unsupported = set(['skip_footer']) -_python_unsupported = set(_c_parser_defaults.keys()) +_python_unsupported = set([ + 'as_recarray', + 'na_filter', + 'compact_ints', + 'use_unsigned', + 'low_memory', + 'memory_map', + 'buffer_lines', + 'error_bad_lines', + 'warn_bad_lines', + 'dtype', + 'decimal', + 'float_precision', +]) def _make_parser_function(name, sep=','): @@ -647,8 +663,13 @@ def _get_options_with_defaults(self, engine): value = kwds[argname] if engine != 'c' and value != default: - raise ValueError('The %r option is not supported with the' - ' %r engine' % (argname, engine)) + if ('python' in engine and + argname not in _python_unsupported): + pass + else: + raise ValueError( + 'The %r option is not supported with the' + ' %r engine' % (argname, engine)) else: value = default options[argname] = value @@ -691,6 +712,9 @@ def _clean_options(self, options, engine): " different from '\s+' are"\ " interpreted as regex)" engine = 'python' + elif delim_whitespace: + if 'python' in engine: + result['delimiter'] = '\s+' if fallback_reason and engine_specified: raise ValueError(fallback_reason) diff --git a/pandas/io/tests/test_parsers.py b/pandas/io/tests/test_parsers.py index 6332116401275..3c1a918bd5628 100755 --- a/pandas/io/tests/test_parsers.py +++ b/pandas/io/tests/test_parsers.py @@ -1878,18 +1878,6 @@ def test_read_table_buglet_4x_multiindex(self): df = self.read_table(StringIO(text), sep='\s+') self.assertEqual(df.index.names, ('one', 'two', 'three', 'four')) - def test_line_comment(self): - data = """# empty -A,B,C -1,2.,4.#hello world -#ignore this line -5.,NaN,10.0 -""" - expected = [[1., 2., 4.], - [5., np.nan, 10.]] - df = self.read_csv(StringIO(data), comment='#') - tm.assert_almost_equal(df.values, expected) - def test_comment_skiprows(self): data = """# empty random line @@ -2404,20 +2392,6 @@ def test_nrows_and_chunksize_raises_notimplemented(self): self.assertRaises(NotImplementedError, self.read_csv, StringIO(data), nrows=10, chunksize=5) - def test_single_char_leading_whitespace(self): - # GH 9710 - data = """\ -MyColumn - a - b - a - b\n""" - - expected = DataFrame({'MyColumn': list('abab')}) - - result = self.read_csv(StringIO(data), skipinitialspace=True) - tm.assert_frame_equal(result, expected) - def test_chunk_begins_with_newline_whitespace(self): # GH 10022 data = '\n hello\nworld\n' @@ -2952,6 +2926,103 @@ def test_skiprow_with_newline_and_quote(self): df = self.read_csv(StringIO(data), skiprows=[1]) tm.assert_frame_equal(df, expected) + def test_line_comment(self): + data = """# empty +A,B,C +1,2.,4.#hello world +#ignore this line +5.,NaN,10.0 +""" + expected = [[1., 2., 4.], + [5., np.nan, 10.]] + df = self.read_csv(StringIO(data), comment='#') + tm.assert_almost_equal(df.values, expected) + # check with delim_whitespace=True + df = self.read_csv(StringIO(data.replace(',', ' ')), comment='#', + delim_whitespace=True) + tm.assert_almost_equal(df.values, expected) + + def test_skiprows_lineterminator(self): + # see gh-9079 + data = '\n'.join(['SMOSMANIA ThetaProbe-ML2X ', + '2007/01/01 01:00 0.2140 U M ', + '2007/01/01 02:00 0.2141 M O ', + '2007/01/01 04:00 0.2142 D M ']) + expected = pd.DataFrame([['2007/01/01', '01:00', 0.2140, 'U', 'M'], + ['2007/01/01', '02:00', 0.2141, 'M', 'O'], + ['2007/01/01', '04:00', 0.2142, 'D', 'M']], + columns=['date', 'time', 'var', 'flag', + 'oflag']) + # test with default lineterminators LF and CRLF + # "CR" is not respected with the Python parser, so + # there is a separate test "test_skiprows_lineterminator_cr" + # in the C engine for that + df = self.read_csv(StringIO(data), skiprows=1, delim_whitespace=True, + names=['date', 'time', 'var', 'flag', 'oflag']) + tm.assert_frame_equal(df, expected) + df = self.read_csv(StringIO(data.replace('\n', '\r\n')), + skiprows=1, delim_whitespace=True, + names=['date', 'time', 'var', 'flag', 'oflag']) + tm.assert_frame_equal(df, expected) + + def test_trailing_spaces(self): + data = "A B C \nrandom line with trailing spaces \nskip\n1,2,3\n1,2.,4.\nrandom line with trailing tabs\t\t\t\n \n5.1,NaN,10.0\n" + expected = pd.DataFrame([[1., 2., 4.], + [5.1, np.nan, 10.]]) + + # gh-8661, gh-8679: this should ignore six lines including + # lines with trailing whitespace and blank lines + df = self.read_csv(StringIO(data.replace(',', ' ')), + header=None, delim_whitespace=True, + skiprows=[0, 1, 2, 3, 5, 6], skip_blank_lines=True) + tm.assert_frame_equal(df, expected) + df = self.read_table(StringIO(data.replace(',', ' ')), + header=None, delim_whitespace=True, + skiprows=[0, 1, 2, 3, 5, 6], skip_blank_lines=True) + tm.assert_frame_equal(df, expected) + + # gh-8983: test skipping set of rows after a row with trailing spaces + expected = pd.DataFrame({"A": [1., 5.1], "B": [2., np.nan], + "C": [4., 10]}) + df = self.read_table(StringIO(data.replace(',', ' ')), + delim_whitespace=True, + skiprows=[1, 2, 3, 5, 6], skip_blank_lines=True) + tm.assert_frame_equal(df, expected) + + def test_raise_on_sep_with_delim_whitespace(self): + # see gh-6607 + data = 'a b c\n1 2 3' + with tm.assertRaisesRegexp(ValueError, 'you can only specify one'): + self.read_table(StringIO(data), sep='\s', delim_whitespace=True) + + def test_single_char_leading_whitespace(self): + # see gh-9710 + data = """\ +MyColumn + a + b + a + b\n""" + + expected = DataFrame({'MyColumn': list('abab')}) + + result = self.read_csv(StringIO(data), delim_whitespace=True, + skipinitialspace=True) + tm.assert_frame_equal(result, expected) + + result = self.read_csv(StringIO(data), skipinitialspace=True) + tm.assert_frame_equal(result, expected) + + def test_usecols_with_whitespace(self): + data = 'a b c\n4 apple bat 5.7\n8 orange cow 10' + + result = self.read_csv(StringIO(data), delim_whitespace=True, + usecols=('a', 'b')) + expected = DataFrame({'a': ['apple', 'orange'], + 'b': ['bat', 'cow']}, index=[4, 8]) + + tm.assert_frame_equal(result, expected) + class CompressionTests(object): def test_zip(self): @@ -3770,18 +3841,6 @@ def test_read_table_buglet_4x_multiindex(self): actual = self.read_table(StringIO(data), sep='\s+') tm.assert_frame_equal(actual, expected) - def test_line_comment(self): - data = """# empty -A,B,C -1,2.,4.#hello world -#ignore this line -5.,NaN,10.0 -""" - expected = [[1., 2., 4.], - [5., np.nan, 10.]] - df = self.read_csv(StringIO(data), comment='#') - tm.assert_almost_equal(df.values, expected) - def test_empty_lines(self): data = """\ A,B,C @@ -3972,6 +4031,45 @@ def test_delim_whitespace_custom_terminator(self): columns=['a', 'b', 'c']) tm.assert_frame_equal(df, expected) + def test_line_comment_customterm(self): + # TODO: move into ParserTests once Python supports custom terminator + data = """# empty +A,B,C +1,2.,4.#hello world +#ignore this line +5.,NaN,10.0 +""" + expected = [[1., 2., 4.], + [5., np.nan, 10.]] + df = self.read_csv(StringIO(data.replace('\n', '*')), comment='#', + lineterminator='*') + tm.assert_almost_equal(df.values, expected) + + def test_skiprows_lineterminator_cr(self): + # see gh-9079 + # TODO: move into ParserTests once Python supports custom terminator + data = '\n'.join(['SMOSMANIA ThetaProbe-ML2X ', + '2007/01/01 01:00 0.2140 U M ', + '2007/01/01 02:00 0.2141 M O ', + '2007/01/01 04:00 0.2142 D M ']) + expected = pd.DataFrame([['2007/01/01', '01:00', 0.2140, 'U', 'M'], + ['2007/01/01', '02:00', 0.2141, 'M', 'O'], + ['2007/01/01', '04:00', 0.2142, 'D', 'M']], + columns=['date', 'time', 'var', 'flag', + 'oflag']) + # test with the three default lineterminators LF, CR and CRLF + df = self.read_csv(StringIO(data), skiprows=1, delim_whitespace=True, + names=['date', 'time', 'var', 'flag', 'oflag']) + tm.assert_frame_equal(df, expected) + df = self.read_csv(StringIO(data.replace('\n', '\r')), + skiprows=1, delim_whitespace=True, + names=['date', 'time', 'var', 'flag', 'oflag']) + tm.assert_frame_equal(df, expected) + df = self.read_csv(StringIO(data.replace('\n', '\r\n')), + skiprows=1, delim_whitespace=True, + names=['date', 'time', 'var', 'flag', 'oflag']) + tm.assert_frame_equal(df, expected) + class TestCParserHighMemory(CParserTests, CompressionTests, tm.TestCase): engine = 'c' @@ -4018,26 +4116,6 @@ def test_usecols(self): raise nose.SkipTest( "Usecols is not supported in C High Memory engine.") - def test_line_comment(self): - data = """# empty -A,B,C -1,2.,4.#hello world -#ignore this line -5.,NaN,10.0 -""" - expected = [[1., 2., 4.], - [5., np.nan, 10.]] - df = self.read_csv(StringIO(data), comment='#') - tm.assert_almost_equal(df.values, expected) - # check with delim_whitespace=True - df = self.read_csv(StringIO(data.replace(',', ' ')), comment='#', - delim_whitespace=True) - tm.assert_almost_equal(df.values, expected) - # check with custom line terminator - df = self.read_csv(StringIO(data.replace('\n', '*')), comment='#', - lineterminator='*') - tm.assert_almost_equal(df.values, expected) - def test_comment_skiprows(self): data = """# empty random line @@ -4053,53 +4131,6 @@ def test_comment_skiprows(self): df = self.read_csv(StringIO(data), comment='#', skiprows=4) tm.assert_almost_equal(df.values, expected) - def test_skiprows_lineterminator(self): - # GH #9079 - data = '\n'.join(['SMOSMANIA ThetaProbe-ML2X ', - '2007/01/01 01:00 0.2140 U M ', - '2007/01/01 02:00 0.2141 M O ', - '2007/01/01 04:00 0.2142 D M ']) - expected = pd.DataFrame([['2007/01/01', '01:00', 0.2140, 'U', 'M'], - ['2007/01/01', '02:00', 0.2141, 'M', 'O'], - ['2007/01/01', '04:00', 0.2142, 'D', 'M']], - columns=['date', 'time', 'var', 'flag', - 'oflag']) - # test with the three default lineterminators LF, CR and CRLF - df = self.read_csv(StringIO(data), skiprows=1, delim_whitespace=True, - names=['date', 'time', 'var', 'flag', 'oflag']) - tm.assert_frame_equal(df, expected) - df = self.read_csv(StringIO(data.replace('\n', '\r')), - skiprows=1, delim_whitespace=True, - names=['date', 'time', 'var', 'flag', 'oflag']) - tm.assert_frame_equal(df, expected) - df = self.read_csv(StringIO(data.replace('\n', '\r\n')), - skiprows=1, delim_whitespace=True, - names=['date', 'time', 'var', 'flag', 'oflag']) - tm.assert_frame_equal(df, expected) - - def test_trailing_spaces(self): - data = "A B C \nrandom line with trailing spaces \nskip\n1,2,3\n1,2.,4.\nrandom line with trailing tabs\t\t\t\n \n5.1,NaN,10.0\n" - expected = pd.DataFrame([[1., 2., 4.], - [5.1, np.nan, 10.]]) - # this should ignore six lines including lines with trailing - # whitespace and blank lines. issues 8661, 8679 - df = self.read_csv(StringIO(data.replace(',', ' ')), - header=None, delim_whitespace=True, - skiprows=[0, 1, 2, 3, 5, 6], skip_blank_lines=True) - tm.assert_frame_equal(df, expected) - df = self.read_table(StringIO(data.replace(',', ' ')), - header=None, delim_whitespace=True, - skiprows=[0, 1, 2, 3, 5, 6], skip_blank_lines=True) - tm.assert_frame_equal(df, expected) - # test skipping set of rows after a row with trailing spaces, issue - # #8983 - expected = pd.DataFrame({"A": [1., 5.1], "B": [2., np.nan], - "C": [4., 10]}) - df = self.read_table(StringIO(data.replace(',', ' ')), - delim_whitespace=True, - skiprows=[1, 2, 3, 5, 6], skip_blank_lines=True) - tm.assert_frame_equal(df, expected) - def test_comment_header(self): data = """# empty # second empty line @@ -4265,25 +4296,6 @@ def test_fallback_to_python(self): with tm.assertRaisesRegexp(ValueError, 'does not support'): self.read_table(StringIO(data), engine='c', skip_footer=1) - def test_single_char_leading_whitespace(self): - # GH 9710 - data = """\ -MyColumn - a - b - a - b\n""" - - expected = DataFrame({'MyColumn': list('abab')}) - - result = self.read_csv(StringIO(data), delim_whitespace=True, - skipinitialspace=True) - tm.assert_frame_equal(result, expected) - - result = self.read_csv(StringIO(data), lineterminator='\n', - skipinitialspace=True) - tm.assert_frame_equal(result, expected) - class TestCParserLowMemory(CParserTests, CompressionTests, tm.TestCase): @@ -4488,16 +4500,6 @@ def test_usecols_implicit_index_col(self): tm.assert_frame_equal(result, expected) - def test_usecols_with_whitespace(self): - data = 'a b c\n4 apple bat 5.7\n8 orange cow 10' - - result = self.read_csv(StringIO(data), delim_whitespace=True, - usecols=('a', 'b')) - expected = DataFrame({'a': ['apple', 'orange'], - 'b': ['bat', 'cow']}, index=[4, 8]) - - tm.assert_frame_equal(result, expected) - def test_usecols_regex_sep(self): # #2733 data = 'a b c\n4 apple bat 5.7\n8 orange cow 10' @@ -4642,6 +4644,7 @@ def test_warn_if_chunks_have_mismatched_type(self): def test_invalid_c_parser_opts_with_not_c_parser(self): from pandas.io.parsers import _c_parser_defaults as c_defaults + from pandas.io.parsers import _python_unsupported as py_unsupported data = """1,2,3,, 1,2,3,4, @@ -4652,6 +4655,9 @@ def test_invalid_c_parser_opts_with_not_c_parser(self): engines = 'python', 'python-fwf' for default in c_defaults: for engine in engines: + if 'python' in engine and default not in py_unsupported: + continue + kwargs = {default: object()} with tm.assertRaisesRegexp(ValueError, 'The %r option is not supported ' @@ -4708,31 +4714,6 @@ def test_fallback_to_python(self): with tm.assertRaisesRegexp(ValueError, 'does not support'): self.read_table(StringIO(data), engine='c', skip_footer=1) - def test_raise_on_sep_with_delim_whitespace(self): - # GH 6607 - data = 'a b c\n1 2 3' - with tm.assertRaisesRegexp(ValueError, 'you can only specify one'): - self.read_table(StringIO(data), sep='\s', delim_whitespace=True) - - def test_single_char_leading_whitespace(self): - # GH 9710 - data = """\ -MyColumn - a - b - a - b\n""" - - expected = DataFrame({'MyColumn': list('abab')}) - - result = self.read_csv(StringIO(data), delim_whitespace=True, - skipinitialspace=True) - tm.assert_frame_equal(result, expected) - - result = self.read_csv(StringIO(data), lineterminator='\n', - skipinitialspace=True) - tm.assert_frame_equal(result, expected) - def test_bool_header_arg(self): # GH 6114 data = """\