diff --git a/ci/code_checks.sh b/ci/code_checks.sh
index 330901ba56fbd..c4b483a794c21 100755
--- a/ci/code_checks.sh
+++ b/ci/code_checks.sh
@@ -44,6 +44,13 @@ if [[ -z "$CHECK" || "$CHECK" == "lint" ]]; then
     flake8 pandas/_libs --filename=*.pxi.in,*.pxd --select=E501,E302,E203,E111,E114,E221,E303,E231,E126,F403
     RET=$(($RET + $?)) ; echo $MSG "DONE"

+    echo "flake8-rst --version"
+    flake8-rst --version
+
+    MSG='Linting code-blocks in .rst documentation' ; echo $MSG
+    flake8-rst doc/source --filename=*.rst
+    RET=$(($RET + $?)) ; echo $MSG "DONE"
+
     # Check that cython casting is of the form `obj` as opposed to ` obj`;
     # it doesn't make a difference, but we want to be internally consistent.
     # Note: this grep pattern is (intended to be) equivalent to the python
@@ -64,6 +71,9 @@ if [[ -z "$CHECK" || "$CHECK" == "lint" ]]; then
     cpplint --quiet --extensions=c,h --headers=h --recursive --filter=-readability/casting,-runtime/int,-build/include_subdir pandas/_libs/src/*.h pandas/_libs/src/parser pandas/_libs/ujson pandas/_libs/tslibs/src/datetime
     RET=$(($RET + $?)) ; echo $MSG "DONE"

+    echo "isort --version-number"
+    isort --version-number
+
     # Imports - Check formatting using isort see setup.cfg for settings
     MSG='Check import format using isort ' ; echo $MSG
     isort --recursive --check-only pandas
diff --git a/ci/deps/travis-36.yaml b/ci/deps/travis-36.yaml
index 352717a842214..8aa551f6194d9 100644
--- a/ci/deps/travis-36.yaml
+++ b/ci/deps/travis-36.yaml
@@ -9,6 +9,7 @@ dependencies:
   - fastparquet
   - flake8>=3.5
   - flake8-comprehensions
+  - flake8-rst
   - gcsfs
   - geopandas
   - html5lib
diff --git a/ci/environment-dev.yaml b/ci/environment-dev.yaml
index 3e69b1f725b24..2718c1cd582b6 100644
--- a/ci/environment-dev.yaml
+++ b/ci/environment-dev.yaml
@@ -7,6 +7,7 @@ dependencies:
   - NumPy
   - flake8
   - flake8-comprehensions
+  - flake8-rst
   - hypothesis>=3.58.0
   - isort
   - moto
diff --git a/ci/requirements_dev.txt b/ci/requirements_dev.txt
index 6a8b8d64d943b..a1cb20c265974 100644
--- a/ci/requirements_dev.txt
+++ b/ci/requirements_dev.txt
@@ -4,6 +4,7 @@ Cython>=0.28.2
 NumPy
 flake8
 flake8-comprehensions
+flake8-rst
 hypothesis>=3.58.0
 isort
 moto
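For context: the new ``flake8-rst`` step extracts the code blocks from the ``.rst`` files and runs flake8 over them, which is what drives the documentation hunks below. A minimal sketch of the kind of spacing fix (pycodestyle E231, missing whitespace after a comma) that most of these hunks apply; the values are illustrative only:

.. code-block:: python

   import numpy as np
   import pandas as pd

   # Before: flake8-rst would report E231 (missing whitespace after ',')
   s = pd.Series([1,3,5,np.nan,6,8])  # noqa: E231

   # After: PEP8-compliant spacing, no warning
   s = pd.Series([1, 3, 5, np.nan, 6, 8])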
diff --git a/doc/source/10min.rst b/doc/source/10min.rst
index fbbe94a72c71e..b5938a24ce6c5 100644
--- a/doc/source/10min.rst
+++ b/doc/source/10min.rst
@@ -45,7 +45,7 @@ a default integer index:

 .. ipython:: python

-   s = pd.Series([1,3,5,np.nan,6,8])
+   s = pd.Series([1, 3, 5, np.nan, 6, 8])
    s

 Creating a :class:`DataFrame` by passing a NumPy array, with a datetime index
@@ -62,12 +62,12 @@ Creating a ``DataFrame`` by passing a dict of objects that can be converted to s

 .. ipython:: python

-   df2 = pd.DataFrame({ 'A' : 1.,
-                        'B' : pd.Timestamp('20130102'),
-                        'C' : pd.Series(1,index=list(range(4)),dtype='float32'),
-                        'D' : np.array([3] * 4,dtype='int32'),
-                        'E' : pd.Categorical(["test","train","test","train"]),
-                        'F' : 'foo' })
+   df2 = pd.DataFrame({'A': 1.,
+                       'B': pd.Timestamp('20130102'),
+                       'C': pd.Series(1, index=list(range(4)), dtype='float32'),
+                       'D': np.array([3] * 4, dtype='int32'),
+                       'E': pd.Categorical(["test", "train", "test", "train"]),
+                       'F': 'foo'})
    df2

 The columns of the resulting ``DataFrame`` have different
@@ -283,9 +283,9 @@ Using the :func:`~Series.isin` method for filtering:
 .. ipython:: python

    df2 = df.copy()
-   df2['E'] = ['one', 'one','two','three','four','three']
+   df2['E'] = ['one', 'one', 'two', 'three', 'four', 'three']
    df2
-   df2[df2['E'].isin(['two','four'])]
+   df2[df2['E'].isin(['two', 'four'])]

 Setting
 ~~~~~~~
@@ -295,7 +295,7 @@ by the indexes.

 .. ipython:: python

-   s1 = pd.Series([1,2,3,4,5,6], index=pd.date_range('20130102', periods=6))
+   s1 = pd.Series([1, 2, 3, 4, 5, 6], index=pd.date_range('20130102', periods=6))
    s1
    df['F'] = s1
@@ -394,7 +394,7 @@ In addition, pandas automatically broadcasts along the specified dimension.

 .. ipython:: python

-   s = pd.Series([1,3,5,np.nan,6,8], index=dates).shift(2)
+   s = pd.Series([1, 3, 5, np.nan, 6, 8], index=dates).shift(2)
    s
    df.sub(s, axis='index')
@@ -492,7 +492,7 @@ section.

 .. ipython:: python

-   df = pd.DataFrame(np.random.randn(8, 4), columns=['A','B','C','D'])
+   df = pd.DataFrame(np.random.randn(8, 4), columns=['A', 'B', 'C', 'D'])
    df
    s = df.iloc[3]
    df.append(s, ignore_index=True)
@@ -512,12 +512,12 @@ See the :ref:`Grouping section <groupby>`.

 .. ipython:: python

-   df = pd.DataFrame({'A' : ['foo', 'bar', 'foo', 'bar',
-                             'foo', 'bar', 'foo', 'foo'],
-                      'B' : ['one', 'one', 'two', 'three',
-                             'two', 'two', 'one', 'three'],
-                      'C' : np.random.randn(8),
-                      'D' : np.random.randn(8)})
+   df = pd.DataFrame({'A': ['foo', 'bar', 'foo', 'bar',
+                            'foo', 'bar', 'foo', 'foo'],
+                      'B': ['one', 'one', 'two', 'three',
+                            'two', 'two', 'one', 'three'],
+                      'C': np.random.randn(8),
+                      'D': np.random.randn(8)})
    df

 Grouping and then applying the :meth:`~DataFrame.sum` function to the resulting
@@ -532,7 +532,7 @@ apply the ``sum`` function.

 .. ipython:: python

-   df.groupby(['A','B']).sum()
+   df.groupby(['A', 'B']).sum()

 Reshaping
 ---------
@@ -578,11 +578,11 @@ See the section on :ref:`Pivot Tables <reshaping.pivot>`.

 .. ipython:: python

-   df = pd.DataFrame({'A' : ['one', 'one', 'two', 'three'] * 3,
-                      'B' : ['A', 'B', 'C'] * 4,
-                      'C' : ['foo', 'foo', 'foo', 'bar', 'bar', 'bar'] * 2,
-                      'D' : np.random.randn(12),
-                      'E' : np.random.randn(12)})
+   df = pd.DataFrame({'A': ['one', 'one', 'two', 'three'] * 3,
+                      'B': ['A', 'B', 'C'] * 4,
+                      'C': ['foo', 'foo', 'foo', 'bar', 'bar', 'bar'] * 2,
+                      'D': np.random.randn(12),
+                      'E': np.random.randn(12)})
    df

 We can produce pivot tables from this data very easily:
@@ -653,7 +653,7 @@ pandas can include categorical data in a ``DataFrame``. For full docs, see the

 .. ipython:: python

-   df = pd.DataFrame({"id":[1,2,3,4,5,6], "raw_grade":['a', 'b', 'b', 'a', 'a', 'e']})
+   df = pd.DataFrame({"id": [1, 2, 3, 4, 5, 6], "raw_grade": ['a', 'b', 'b', 'a', 'a', 'e']})

 Convert the raw grades to a categorical data type.
@@ -753,13 +753,13 @@ Writing to a HDF5 Store.

 .. ipython:: python

-   df.to_hdf('foo.h5','df')
+   df.to_hdf('foo.h5', 'df')

 Reading from a HDF5 Store.

 .. ipython:: python

-   pd.read_hdf('foo.h5','df')
+   pd.read_hdf('foo.h5', 'df')

 .. ipython:: python
    :suppress:
@@ -796,7 +796,7 @@ If you are attempting to perform an operation you might see an exception like:

 .. code-block:: python

     >>> if pd.Series([False, True, False]):
-        print("I was true")
+    ...     print("I was true")
     Traceback
         ...
     ValueError: The truth value of an array is ambiguous. Use a.empty, a.any() or a.all().
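Since the ambiguity error in the ``10min.rst`` hunk above comes up often, a small sketch of the idiomatic alternatives, using the reductions the error message itself suggests:

.. code-block:: python

   import pandas as pd

   s = pd.Series([False, True, False])

   # Reduce to a single boolean explicitly instead of writing ``if s:``
   s.any()    # True: at least one element is True
   s.all()    # False: not every element is True
   s.empty    # False: the Series has elements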
diff --git a/doc/source/advanced.rst b/doc/source/advanced.rst
index 608e2c8e72ded..24c117a534209 100644
--- a/doc/source/advanced.rst
+++ b/doc/source/advanced.rst
@@ -318,13 +318,13 @@ As usual, **both sides** of the slicers are included as this is label indexing.

 .. code-block:: python

-   df.loc[(slice('A1','A3'),.....), :]
+   df.loc[(slice('A1', 'A3'), ...), :]  # noqa: E999

 You should **not** do this:

 .. code-block:: python

-   df.loc[(slice('A1','A3'),.....)]
+   df.loc[(slice('A1', 'A3'), ...)]  # noqa: E999

 .. ipython:: python
@@ -532,7 +532,7 @@ used to move the values from the ``MultiIndex`` to a column.

 .. ipython:: python

    df.rename_axis(index=['abc', 'def'])
-
+
 Note that the columns of a ``DataFrame`` are an index, so that using
 ``rename_axis`` with the ``columns`` argument will change the name of that
 index.
@@ -779,7 +779,7 @@ values **not** in the categories, similarly to how you can reindex **any** panda
    Reshaping and Comparison operations on a ``CategoricalIndex`` must have the same categories
    or a ``TypeError`` will be raised.

-   .. code-block:: python
+   .. code-block:: ipython

       In [9]: df3 = pd.DataFrame({'A' : np.arange(6), 'B' : pd.Series(list('aabbca')).astype('category')})
@@ -1071,7 +1071,7 @@ On the other hand, if the index is not monotonic, then both slice bounds must be
    # OK because 2 and 4 are in the index
    df.loc[2:4, :]

-.. code-block:: python
+.. code-block:: ipython

    # 0 is not in the index
    In [9]: df.loc[0:4, :]
diff --git a/doc/source/basics.rst b/doc/source/basics.rst
index 81efbfd6d1403..d19fcedf4e766 100644
--- a/doc/source/basics.rst
+++ b/doc/source/basics.rst
@@ -306,8 +306,8 @@ To evaluate single-element pandas objects in a boolean context, use the method

     .. code-block:: python

-        >>> if df:
-             ...
+        >>> if df:  # noqa: E999
+        ...

     Or
@@ -317,7 +317,7 @@ To evaluate single-element pandas objects in a boolean context, use the method

     These will both raise errors, as you are trying to compare multiple values.

-    .. code-block:: python
+    .. code-block:: python-traceback

         ValueError: The truth value of an array is ambiguous. Use a.empty, a.any() or a.all().
@@ -732,9 +732,8 @@ with the equivalent

 .. code-block:: python

    >>> (df.pipe(h)
-          .pipe(g, arg1=1)
-          .pipe(f, arg2=2, arg3=3)
-       )
+   ...    .pipe(g, arg1=1)
+   ...    .pipe(f, arg2=2, arg3=3))

 Pandas encourages the second style, which is known as method chaining. ``pipe``
 makes it easy to use your own or another library's functions
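For readers comparing the two styles in the ``basics.rst`` hunk above: the chained form is equivalent to nesting the calls inside out. A sketch using the hypothetical functions ``f``, ``g`` and ``h`` that the document itself assumes:

.. code-block:: python

   # Nested-call form; reads inside out
   f(g(h(df), arg1=1), arg2=2, arg3=3)

   # Method-chained form via pipe; reads top to bottom
   (df.pipe(h)
      .pipe(g, arg1=1)
      .pipe(f, arg2=2, arg3=3))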
diff --git a/doc/source/comparison_with_sas.rst b/doc/source/comparison_with_sas.rst
index 4d7acdf9ab16c..318bffe44a81b 100644
--- a/doc/source/comparison_with_sas.rst
+++ b/doc/source/comparison_with_sas.rst
@@ -744,7 +744,7 @@ XPORT is a relatively limited format and the parsing of it is not as
 optimized as some of the other pandas readers. An alternative way
 to interop data between SAS and pandas is to serialize to csv.

-.. code-block:: python
+.. code-block:: ipython

    # version 0.17, 10M rows
diff --git a/doc/source/contributing.rst b/doc/source/contributing.rst
index 3ec505998fde0..084f710091a1b 100644
--- a/doc/source/contributing.rst
+++ b/doc/source/contributing.rst
@@ -792,7 +792,7 @@ Transitioning to ``pytest``
 .. code-block:: python

     class TestReallyCoolFeature(object):
-        ....
+        pass

 Going forward, we are moving to a more *functional* style using the
 `pytest <https://docs.pytest.org/en/latest/>`__ framework, which offers a richer testing
@@ -800,7 +800,7 @@ framework that will facilitate testing and developing. Thus, instead of writing

 .. code-block:: python

     def test_really_cool_feature():
-        ....
+        pass

 Using ``pytest``
 ~~~~~~~~~~~~~~~~
@@ -825,25 +825,30 @@ We would name this file ``test_cool_feature.py`` and put in an appropriate place
     import pandas as pd
     from pandas.util import testing as tm

+
     @pytest.mark.parametrize('dtype', ['int8', 'int16', 'int32', 'int64'])
     def test_dtypes(dtype):
         assert str(np.dtype(dtype)) == dtype

-    @pytest.mark.parametrize('dtype', ['float32',
-                                       pytest.param('int16', marks=pytest.mark.skip),
-                                       pytest.param('int32',
-                                                    marks=pytest.mark.xfail(reason='to show how it works'))])
+
+    @pytest.mark.parametrize(
+        'dtype', ['float32', pytest.param('int16', marks=pytest.mark.skip),
+                  pytest.param('int32', marks=pytest.mark.xfail(
+                      reason='to show how it works'))])
     def test_mark(dtype):
         assert str(np.dtype(dtype)) == 'float32'

+
     @pytest.fixture
     def series():
         return pd.Series([1, 2, 3])

+
     @pytest.fixture(params=['int8', 'int16', 'int32', 'int64'])
     def dtype(request):
         return request.param

+
     def test_series(series, dtype):
         result = series.astype(dtype)
         assert result.dtype == dtype
@@ -912,6 +917,7 @@ for details <https://hypothesis.readthedocs.io/>`_.
         st.lists(any_json_value),
         st.dictionaries(st.text(), any_json_value)
     ))
+
     @given(value=any_json_value)
     def test_json_roundtrip(value):
         result = json.loads(json.dumps(value))
diff --git a/doc/source/contributing_docstring.rst b/doc/source/contributing_docstring.rst
index 38e4baa66ef67..2f8ffc2e07c71 100644
--- a/doc/source/contributing_docstring.rst
+++ b/doc/source/contributing_docstring.rst
@@ -197,6 +197,8 @@ infinitive verb.
         """
         pass

+.. code-block:: python
+
     def astype(dtype):
         """
         Method to cast Series type.
@@ -205,6 +207,8 @@ infinitive verb.
         """
         pass

+.. code-block:: python
+
     def astype(dtype):
         """
         Cast Series type
@@ -213,6 +217,8 @@ infinitive verb.
         """
         pass

+.. code-block:: python
+
     def astype(dtype):
         """
         Cast Series type from its current type to the new type defined in
@@ -624,6 +630,7 @@ A simple example could be:
 .. code-block:: python

     class Series:
+
         def head(self, n=5):
             """
             Return the first elements of the Series.
@@ -681,12 +688,11 @@ shown:

 .. code-block:: python

-    import numpy as np
-    import pandas as pd
-
+    import numpy as np  # noqa: F401
+    import pandas as pd  # noqa: F401

 Any other module used in the examples must be explicitly imported, one per line (as
-recommended in `PEP-8 <https://www.python.org/dev/peps/pep-0008/#imports>`_)
+recommended in :pep:`8#imports`)
 and avoiding aliases. Avoid excessive imports, but if needed, imports from
 the standard library go first, followed by third-party libraries (like
 matplotlib).
@@ -720,6 +726,7 @@ positional arguments ``head(3)``.
 .. code-block:: python

     class Series:
+
         def mean(self):
             """
             Compute the mean of the input.
@@ -946,12 +953,14 @@ substitute the children's class names in this docstring.
             """Apply my function to %(klass)s."""
             ...

+
     class ChildA(Parent):
         @Substitution(klass="ChildA")
         @Appender(Parent.my_function.__doc__)
         def my_function(self):
             ...

+
     class ChildB(Parent):
         @Substitution(klass="ChildB")
         @Appender(Parent.my_function.__doc__)
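The ``Substitution``/``Appender`` pattern in the ``contributing_docstring.rst`` hunk above boils down to plain string formatting on ``__doc__``. A minimal decorator-free sketch of the same idea; the class names are illustrative:

.. code-block:: python

   class Parent(object):
       def my_function(self):
           """Apply my function to %(klass)s."""

   class ChildA(Parent):
       def my_function(self):
           pass
       # Fill the placeholder from the shared parent docstring
       my_function.__doc__ = Parent.my_function.__doc__ % {'klass': 'ChildA'}

   print(ChildA.my_function.__doc__)  # prints: Apply my function to ChildA.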
diff --git a/doc/source/cookbook.rst b/doc/source/cookbook.rst
index 3d26a9c7d3d54..53468e755a722 100644
--- a/doc/source/cookbook.rst
+++ b/doc/source/cookbook.rst
@@ -968,7 +968,7 @@ Parsing date components in multi-columns

 Parsing date components in multi-columns is faster with a format

-.. code-block:: python
+.. code-block:: ipython

    In [30]: i = pd.date_range('20000101',periods=10000)
@@ -1266,6 +1266,7 @@ The `method` argument within `DataFrame.corr` can accept a callable in addition
     ...
     ...     return cov_ab / std_a / std_b
     ...
+    ...
     >>> df = pd.DataFrame(np.random.normal(size=(100, 3)))
     ...
     >>> df.corr(method=distcorr)
diff --git a/doc/source/dsintro.rst b/doc/source/dsintro.rst
index d02912294060c..b55f93566c03d 100644
--- a/doc/source/dsintro.rst
+++ b/doc/source/dsintro.rst
@@ -566,13 +566,12 @@ To write code compatible with all versions of Python, split the assignment in tw
   .. code-block:: python

      >>> dependent = pd.DataFrame({"A": [1, 1, 1]})
-     >>> dependent.assign(A=lambda x: x["A"] + 1,
-                          B=lambda x: x["A"] + 2)
+     >>> dependent.assign(A=lambda x: x["A"] + 1, B=lambda x: x["A"] + 2)

   For Python 3.5 and earlier the expression creating ``B`` refers to the
   "old" value of ``A``, ``[1, 1, 1]``. The output is then

-  .. code-block:: python
+  .. code-block:: console

        A  B
     0  2  3
@@ -582,7 +581,7 @@ To write code compatible with all versions of Python, split the assignment in tw
   For Python 3.6 and later, the expression creating ``A`` refers to the
   "new" value of ``A``, ``[2, 2, 2]``, which results in

-  .. code-block:: python
+  .. code-block:: console

        A  B
     0  2  4
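To make the ``dsintro.rst`` advice concrete: splitting the ``assign`` into two calls pins down the evaluation order on every Python version, because the second call always sees the already-updated ``'A'``. A short sketch:

.. code-block:: python

   import pandas as pd

   dependent = pd.DataFrame({"A": [1, 1, 1]})

   # Two steps: 'B' is always computed from the new 'A' ([2, 2, 2])
   dependent = dependent.assign(A=lambda x: x["A"] + 1)
   dependent = dependent.assign(B=lambda x: x["A"] + 2)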
diff --git a/doc/source/enhancingperf.rst b/doc/source/enhancingperf.rst
index 8f8a9fe3e50e0..2ca8a2b7ac0f8 100644
--- a/doc/source/enhancingperf.rst
+++ b/doc/source/enhancingperf.rst
@@ -298,7 +298,7 @@ advanced Cython techniques:
 Even faster, with the caveat that a bug in our Cython code (an off-by-one
 error, for example) might cause a segfault because memory access isn't
 checked.
-For more about ``boundscheck`` and ``wraparound``, see the Cython docs on
+For more about ``boundscheck`` and ``wraparound``, see the Cython docs on
 `compiler directives`__.

 .. _enhancingperf.numba:
@@ -323,39 +323,45 @@ Numba works by generating optimized machine code using the LLVM compiler infrast
 Jit
 ~~~

-We demonstrate how to use Numba to just-in-time compile our code. We simply
+We demonstrate how to use Numba to just-in-time compile our code. We simply
 take the plain Python code from above and annotate with the ``@jit`` decorator.

 .. code-block:: python

     import numba

+
     @numba.jit
     def f_plain(x):
-       return x * (x - 1)
+        return x * (x - 1)
+

     @numba.jit
     def integrate_f_numba(a, b, N):
-       s = 0
-       dx = (b - a) / N
-       for i in range(N):
-           s += f_plain(a + i * dx)
-       return s * dx
+        s = 0
+        dx = (b - a) / N
+        for i in range(N):
+            s += f_plain(a + i * dx)
+        return s * dx
+

     @numba.jit
     def apply_integrate_f_numba(col_a, col_b, col_N):
-       n = len(col_N)
-       result = np.empty(n, dtype='float64')
-       assert len(col_a) == len(col_b) == n
-       for i in range(n):
-           result[i] = integrate_f_numba(col_a[i], col_b[i], col_N[i])
-       return result
+        n = len(col_N)
+        result = np.empty(n, dtype='float64')
+        assert len(col_a) == len(col_b) == n
+        for i in range(n):
+            result[i] = integrate_f_numba(col_a[i], col_b[i], col_N[i])
+        return result
+

     def compute_numba(df):
-       result = apply_integrate_f_numba(df['a'].values, df['b'].values, df['N'].values)
-       return pd.Series(result, index=df.index, name='result')
+        result = apply_integrate_f_numba(df['a'].values, df['b'].values,
+                                         df['N'].values)
+        return pd.Series(result, index=df.index, name='result')

-Note that we directly pass NumPy arrays to the Numba function. ``compute_numba`` is just a wrapper that provides a nicer interface by passing/returning pandas objects.
+Note that we directly pass NumPy arrays to the Numba function. ``compute_numba`` is just a wrapper that provides a
+nicer interface by passing/returning pandas objects.

 .. code-block:: ipython
@@ -375,13 +381,16 @@ Consider the following toy example of doubling each observation:

     import numba

+
     def double_every_value_nonumba(x):
-       return x*2
+        return x * 2
+

     @numba.vectorize
     def double_every_value_withnumba(x):
-       return x*2
+        return x * 2

+.. code-block:: ipython

     # Custom function without numba
     In [5]: %timeit df['col1_doubled'] = df.a.apply(double_every_value_nonumba)
@@ -402,18 +411,18 @@ Caveats
 Numba will execute on any function, but can only accelerate certain classes of
 functions.

-Numba is best at accelerating functions that apply numerical functions to NumPy
-arrays. When passed a function that only uses operations it knows how to
+Numba is best at accelerating functions that apply numerical functions to NumPy
+arrays. When passed a function that only uses operations it knows how to
 accelerate, it will execute in ``nopython`` mode.

-If Numba is passed a function that includes something it doesn't know how to
-work with -- a category that currently includes sets, lists, dictionaries, or
-string functions -- it will revert to ``object mode``. In ``object mode``,
-Numba will execute but your code will not speed up significantly. If you would
-prefer that Numba throw an error if it cannot compile a function in a way that
-speeds up your code, pass Numba the argument
-``nopython=True`` (e.g. ``@numba.jit(nopython=True)``). For more on
-troubleshooting Numba modes, see the `Numba troubleshooting page
+If Numba is passed a function that includes something it doesn't know how to
+work with -- a category that currently includes sets, lists, dictionaries, or
+string functions -- it will revert to ``object mode``. In ``object mode``,
+Numba will execute but your code will not speed up significantly. If you would
+prefer that Numba throw an error if it cannot compile a function in a way that
+speeds up your code, pass Numba the argument
+``nopython=True`` (e.g. ``@numba.jit(nopython=True)``). For more on
+troubleshooting Numba modes, see the `Numba troubleshooting page
 <http://numba.pydata.org/numba-doc/latest/user/troubleshoot.html>`__.

 Read more in the `Numba docs <http://numba.pydata.org/>`__.
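Given the caveats in the hunk above, one way to surface compilation problems early is to request ``nopython`` mode explicitly, so Numba raises instead of silently falling back to object mode. A minimal sketch; the function itself is illustrative only:

.. code-block:: python

   import numba
   import numpy as np

   @numba.jit(nopython=True)
   def sum_positive(arr):
       # Plain loops over NumPy arrays compile cleanly in nopython mode
       total = 0.0
       for x in arr:
           if x > 0:
               total += x
       return total

   sum_positive(np.array([1.0, -2.0, 3.0]))  # 4.0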
diff --git a/doc/source/extending.rst b/doc/source/extending.rst
index 1e8a8e50dd9e3..6c47d0ae8bd84 100644
--- a/doc/source/extending.rst
+++ b/doc/source/extending.rst
@@ -163,6 +163,7 @@ your ``MyExtensionArray`` class, as follows:
     class MyExtensionArray(ExtensionArray, ExtensionScalarOpsMixin):
         pass

+
     MyExtensionArray._add_arithmetic_ops()
     MyExtensionArray._add_comparison_ops()
@@ -205,6 +206,7 @@ To use a test, subclass it:

     from pandas.tests.extension import base

+
     class TestConstructors(base.BaseConstructorsTests):
         pass
@@ -277,6 +279,7 @@ Below example shows how to define ``SubclassedSeries`` and ``SubclassedDataFrame
         def _constructor_expanddim(self):
             return SubclassedDataFrame

+
     class SubclassedDataFrame(DataFrame):

         @property
@@ -297,7 +300,7 @@ Below example shows how to define ``SubclassedSeries`` and ``SubclassedDataFrame
     >>> type(to_framed)
     <class '__main__.SubclassedDataFrame'>

-    >>> df = SubclassedDataFrame({'A', [1, 2, 3], 'B': [4, 5, 6], 'C': [7, 8, 9]})
+    >>> df = SubclassedDataFrame({'A': [1, 2, 3], 'B': [4, 5, 6], 'C': [7, 8, 9]})
     >>> df
        A  B  C
     0  1  4  7
@@ -313,6 +316,7 @@ Below example shows how to define ``SubclassedSeries`` and ``SubclassedDataFrame
     0  1  4
     1  2  5
     2  3  6
+
     >>> type(sliced1)
     <class '__main__.SubclassedDataFrame'>
@@ -322,6 +326,7 @@ Below example shows how to define ``SubclassedSeries`` and ``SubclassedDataFrame
     1    2
     2    3
     Name: A, dtype: int64
+
     >>> type(sliced2)
     <class '__main__.SubclassedSeries'>
diff --git a/doc/source/gotchas.rst b/doc/source/gotchas.rst
index 79e312ca12833..0eb2a4eed8581 100644
--- a/doc/source/gotchas.rst
+++ b/doc/source/gotchas.rst
@@ -98,7 +98,7 @@ of the following code should be:

 .. code-block:: python

-    >>> if pd.Series([False, True, False]):
+    >>> if pd.Series([False, True, False]):  # noqa: E999
     ...

 Should it be ``True`` because it's not zero-length, or ``False`` because there
@@ -107,7 +107,7 @@ are ``False`` values? It is unclear, so instead, pandas raises a ``ValueError``:

 .. code-block:: python

     >>> if pd.Series([False, True, False]):
-        print("I was true")
+    ...     print("I was true")
     Traceback
         ...
     ValueError: The truth value of an array is ambiguous. Use a.empty, a.any() or a.all().
@@ -119,8 +119,8 @@ Alternatively, you might want to compare if the pandas object is ``None``:

 .. code-block:: python

     >>> if pd.Series([False, True, False]) is not None:
-        print("I was not None")
-    >>> I was not None
+    ...     print("I was not None")
+    I was not None

 Below is how to check if any of the values are ``True``:
@@ -128,8 +128,8 @@ Below is how to check if any of the values are ``True``:

 .. code-block:: python

     >>> if pd.Series([False, True, False]).any():
-        print("I am any")
-    >>> I am any
+    ...     print("I am any")
+    I am any

 To evaluate single-element pandas objects in a boolean context, use the method
 :meth:`~DataFrame.bool`:
@@ -316,7 +316,7 @@ Occasionally you may have to deal with data that were created on a machine with
 a different byte order than the one on which you are running Python. A common
 symptom of this issue is an error like:

-.. code-block:: python
+.. code-block:: python-traceback

     Traceback
         ...
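For the byte-order gotcha above, the usual remedy is to convert the underlying NumPy array to the native byte order before handing it to pandas. A sketch with a synthetic big-endian array; the variable names are illustrative:

.. code-block:: python

   import numpy as np
   import pandas as pd

   big_endian = np.array([1, 2, 3], dtype='>i4')  # non-native on most machines

   # Swap the bytes and re-label the dtype as native order
   native = big_endian.byteswap().newbyteorder()
   s = pd.Series(native)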
diff --git a/doc/source/groupby.rst b/doc/source/groupby.rst
index 755edba352f05..fb96afaf7d796 100644
--- a/doc/source/groupby.rst
+++ b/doc/source/groupby.rst
@@ -79,7 +79,7 @@ pandas objects can be split on any of their axes. The abstract definition of
 grouping is to provide a mapping of labels to group names. To create a GroupBy object
 (more on what the GroupBy object is later), you may do the following:

-.. code-block:: ipython
+.. code-block:: python

    # default is axis=0
    >>> grouped = obj.groupby(key)
@@ -1310,7 +1310,7 @@ arbitrary function, for example:

 .. code-block:: python

-    (df.groupby(['Store', 'Product']).pipe(report_func)
+    df.groupby(['Store', 'Product']).pipe(report_func)

 where ``report_func`` takes a GroupBy object and creates a report
 from that.
diff --git a/doc/source/indexing.rst b/doc/source/indexing.rst
index 1c63acce6e3fa..5740ab5fa6921 100644
--- a/doc/source/indexing.rst
+++ b/doc/source/indexing.rst
@@ -537,10 +537,10 @@ A list of indexers where any element is out of bounds will raise an

 .. code-block:: python

-   dfl.iloc[[4, 5, 6]]
+   >>> dfl.iloc[[4, 5, 6]]
    IndexError: positional indexers are out-of-bounds

-   dfl.iloc[:, 4]
+   >>> dfl.iloc[:, 4]
    IndexError: single positional indexer is out-of-bounds

 .. _indexing.callable:
@@ -1794,7 +1794,7 @@ interpreter executes this code:

 .. code-block:: python

-   dfmi.loc[:,('one','second')] = value
+   dfmi.loc[:, ('one', 'second')] = value
    # becomes
    dfmi.loc.__setitem__((slice(None), ('one', 'second')), value)
@@ -1827,10 +1827,10 @@ that you've done this:

 .. code-block:: python

    def do_something(df):
-      foo = df[['bar', 'baz']]  # Is foo a view? A copy? Nobody knows!
-      # ... many lines here ...
-      foo['quux'] = value       # We don't know whether this will modify df or not!
-      return foo
+       foo = df[['bar', 'baz']]  # Is foo a view? A copy? Nobody knows!
+       # ... many lines here ...
+       foo['quux'] = value  # We don't know whether this will modify df or not!
+       return foo

 Yikes!
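To complement the ``do_something`` warning above: writing through a single ``.loc`` call removes the view-versus-copy ambiguity, because only one ``__setitem__`` is involved. A small sketch:

.. code-block:: python

   import pandas as pd

   df = pd.DataFrame({'bar': [1, 2], 'baz': [3, 4]})

   # Ambiguous: chained indexing assigns into a temporary object
   # df[df['bar'] > 1]['baz'] = 0

   # Unambiguous: a single __setitem__ on df itself
   df.loc[df['bar'] > 1, 'baz'] = 0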
diff --git a/doc/source/io.rst b/doc/source/io.rst
index 9f458b58717d6..0acb0dfbee2d7 100644
--- a/doc/source/io.rst
+++ b/doc/source/io.rst
@@ -1834,8 +1834,7 @@ For example:

 .. code-block:: python

-    DataFrame([1.0, 2.0, complex(1.0, 2.0)]).to_json()  # raises
-
+    >>> DataFrame([1.0, 2.0, complex(1.0, 2.0)]).to_json()  # raises
     RuntimeError: Unhandled numpy dtype 15

 can be dealt with by specifying a simple ``default_handler``:
@@ -2411,8 +2410,8 @@ columns to strings.

 .. code-block:: python

    url_mcc = 'https://en.wikipedia.org/wiki/Mobile_country_code'
-   dfs = pd.read_html(url_mcc, match='Telekom Albania', header=0, converters={'MNC':
-   str})
+   dfs = pd.read_html(url_mcc, match='Telekom Albania', header=0,
+                      converters={'MNC': str})

 .. versionadded:: 0.19
@@ -2724,7 +2723,8 @@ different parameters:

     data = {}
     # For when Sheet1's format differs from Sheet2
     with pd.ExcelFile('path_to_file.xls') as xls:
-        data['Sheet1'] = pd.read_excel(xls, 'Sheet1', index_col=None, na_values=['NA'])
+        data['Sheet1'] = pd.read_excel(xls, 'Sheet1', index_col=None,
+                                       na_values=['NA'])
         data['Sheet2'] = pd.read_excel(xls, 'Sheet2', index_col=1)

 Note that if the same parsing parameters are used for all sheets, a list
@@ -2735,11 +2735,14 @@ of sheet names can simply be passed to ``read_excel`` with no loss in performanc

     # using the ExcelFile class
     data = {}
     with pd.ExcelFile('path_to_file.xls') as xls:
-        data['Sheet1'] = read_excel(xls, 'Sheet1', index_col=None, na_values=['NA'])
-        data['Sheet2'] = read_excel(xls, 'Sheet2', index_col=None, na_values=['NA'])
+        data['Sheet1'] = read_excel(xls, 'Sheet1', index_col=None,
+                                    na_values=['NA'])
+        data['Sheet2'] = read_excel(xls, 'Sheet2', index_col=None,
+                                    na_values=['NA'])

     # equivalent using the read_excel function
-    data = read_excel('path_to_file.xls', ['Sheet1', 'Sheet2'], index_col=None, na_values=['NA'])
+    data = read_excel('path_to_file.xls', ['Sheet1', 'Sheet2'],
+                      index_col=None, na_values=['NA'])

 .. _io.excel.specifying_sheets:
@@ -2899,7 +2902,10 @@ missing data to recover integer dtype:

 .. code-block:: python

-   cfun = lambda x: int(x) if x else -1
+   def cfun(x):
+       return int(x) if x else -1
+
+
    read_excel('path_to_file.xls', 'Sheet1', converters={'MyInts': cfun})

 dtype Specifications
@@ -3040,7 +3046,7 @@ argument to ``to_excel`` and to ``ExcelWriter``. The built-in engines are:

    writer = ExcelWriter('path_to_file.xlsx', engine='xlsxwriter')

    # Or via pandas configuration.
-   from pandas import options
+   from pandas import options  # noqa: E402
    options.io.excel.xlsx.writer = 'xlsxwriter'

    df.to_excel('path_to_file.xlsx', sheet_name='Sheet1')
@@ -3067,7 +3073,7 @@ which takes the contents of the clipboard buffer and passes them to the
 ``read_csv`` method. For instance, you can copy the following text to the
 clipboard (CTRL-C on many operating systems):

-.. code-block:: python
+.. code-block:: console

      A B C
    x 1 4 p
@@ -3476,9 +3482,8 @@ This format is specified by default when using ``put`` or ``to_hdf`` or by ``for

 .. code-block:: python

-   pd.DataFrame(randn(10, 2)).to_hdf('test_fixed.h5', 'df')
-
-   pd.read_hdf('test_fixed.h5', 'df', where='index>5')
+   >>> pd.DataFrame(randn(10, 2)).to_hdf('test_fixed.h5', 'df')
+   >>> pd.read_hdf('test_fixed.h5', 'df', where='index>5')
    TypeError: cannot pass a where specification when reading a fixed format.
             this store must be selected in its entirety
@@ -3574,7 +3579,7 @@ will yield a tuple for each group key along with the relative keys of its conten

    Hierarchical keys cannot be retrieved as dotted (attribute) access as described above for items stored under the root node.

-   .. code-block:: python
+   .. code-block:: ipython

       In [8]: store.foo.bar.bah
       AttributeError: 'HDFStore' object has no attribute 'foo'
@@ -3732,10 +3737,10 @@ The right-hand side of the sub-expression (after a comparison operator) can be:

    instead of this

-   .. code-block:: python
+   .. code-block:: ipython

       string = "HolyMoly'"
-      store.select('df',  'index == %s' % string)
+      store.select('df', 'index == %s' % string)

    The latter will **not** work and will raise a ``SyntaxError``.Note that
    there's a single quote followed by a double quote in the ``string``
@@ -3941,7 +3946,7 @@ The default is 50,000 rows returned in a chunk.

 .. code-block:: python

-   for df in pd.read_hdf('store.h5','df', chunksize=3):
+   for df in pd.read_hdf('store.h5', 'df', chunksize=3):
        print(df)

 Note, that the chunksize keyword applies to the **source** rows. So if you
@@ -4871,7 +4876,8 @@ to pass to :func:`pandas.to_datetime`:

 .. code-block:: python

    pd.read_sql_table('data', engine, parse_dates={'Date': '%Y-%m-%d'})
-   pd.read_sql_table('data', engine, parse_dates={'Date': {'format': '%Y-%m-%d %H:%M:%S'}})
+   pd.read_sql_table('data', engine,
+                     parse_dates={'Date': {'format': '%Y-%m-%d %H:%M:%S'}})

 You can check if a table exists using :func:`~pandas.io.sql.has_table`
@@ -5374,11 +5380,11 @@ And here's the code:

    import pandas as pd
    import sqlite3
    from numpy.random import randn
-   from pandas.io import sql

    sz = 1000000
    df = pd.DataFrame({'A': randn(sz), 'B': [1] * sz})

+
    def test_sql_write(df):
        if os.path.exists('test.sql'):
            os.remove('test.sql')
@@ -5386,55 +5392,73 @@ And here's the code:
        df.to_sql(name='test_table', con=sql_db)
        sql_db.close()

+
    def test_sql_read():
        sql_db = sqlite3.connect('test.sql')
        pd.read_sql_query("select * from test_table", sql_db)
        sql_db.close()

+
    def test_hdf_fixed_write(df):
        df.to_hdf('test_fixed.hdf', 'test', mode='w')

+
    def test_hdf_fixed_read():
        pd.read_hdf('test_fixed.hdf', 'test')

+
    def test_hdf_fixed_write_compress(df):
        df.to_hdf('test_fixed_compress.hdf', 'test', mode='w', complib='blosc')

+
    def test_hdf_fixed_read_compress():
        pd.read_hdf('test_fixed_compress.hdf', 'test')

+
    def test_hdf_table_write(df):
        df.to_hdf('test_table.hdf', 'test', mode='w', format='table')

+
    def test_hdf_table_read():
        pd.read_hdf('test_table.hdf', 'test')

+
    def test_hdf_table_write_compress(df):
-       df.to_hdf('test_table_compress.hdf', 'test', mode='w', complib='blosc', format='table')
+       df.to_hdf('test_table_compress.hdf', 'test', mode='w',
+                 complib='blosc', format='table')

+
    def test_hdf_table_read_compress():
        pd.read_hdf('test_table_compress.hdf', 'test')

+
    def test_csv_write(df):
        df.to_csv('test.csv', mode='w')

+
    def test_csv_read():
        pd.read_csv('test.csv', index_col=0)

+
    def test_feather_write(df):
        df.to_feather('test.feather')

+
    def test_feather_read():
        pd.read_feather('test.feather')

+
    def test_pickle_write(df):
        df.to_pickle('test.pkl')

+
    def test_pickle_read():
        pd.read_pickle('test.pkl')

+
    def test_pickle_write_compress(df):
        df.to_pickle('test.pkl.compress', compression='xz')

+
    def test_pickle_read_compress():
        pd.read_pickle('test.pkl.compress', compression='xz')
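As the ``TypeError`` in the fixed-format hunk above shows, ``where`` selection requires the ``table`` format. A sketch of the working counterpart; the file name is illustrative:

.. code-block:: python

   import numpy as np
   import pandas as pd

   df = pd.DataFrame(np.random.randn(10, 2))

   # 'table' format supports on-disk queries, unlike the default 'fixed'
   df.to_hdf('test_table.h5', 'df', format='table')
   pd.read_hdf('test_table.h5', 'df', where='index > 5')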
diff --git a/doc/source/missing_data.rst b/doc/source/missing_data.rst
index e4b5578af15f0..4864637691607 100644
--- a/doc/source/missing_data.rst
+++ b/doc/source/missing_data.rst
@@ -696,9 +696,8 @@ You can also operate on the DataFrame in place:

 .. code-block:: python

-   s = pd.Series([True, False, True])
-   s.replace({'a string': 'new value', True: False})  # raises
-
+   >>> s = pd.Series([True, False, True])
+   >>> s.replace({'a string': 'new value', True: False})  # raises
    TypeError: Cannot compare types 'ndarray(dtype=bool)' and 'str'

 will raise a ``TypeError`` because one of the ``dict`` keys is not of the
@@ -728,7 +727,7 @@ rules introduced in the table below.
    :header: "data type", "Cast to"
    :widths: 40, 40

-   integer, float
+   integer, float
    boolean, object
    float, no cast
    object, no cast
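Since the ``TypeError`` above is triggered by mixing key types in a single ``replace`` call, one workaround is to perform the replacements separately so each call compares like with like. A short sketch:

.. code-block:: python

   import pandas as pd

   s = pd.Series([True, False, True])

   # One value type per call avoids the mixed-type comparison
   s = s.replace(True, False)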
diff --git a/doc/source/reshaping.rst b/doc/source/reshaping.rst
index 7d9925d800441..6163b6f2ae89a 100644
--- a/doc/source/reshaping.rst
+++ b/doc/source/reshaping.rst
@@ -45,13 +45,19 @@ For the curious here is how the above ``DataFrame`` was created:

 .. code-block:: python

-   import pandas.util.testing as tm; tm.N = 3
+   import pandas.util.testing as tm
+
+   tm.N = 3
+
+
    def unpivot(frame):
        N, K = frame.shape
-       data = {'value' : frame.values.ravel('F'),
-               'variable' : np.asarray(frame.columns).repeat(N),
-               'date' : np.tile(np.asarray(frame.index), K)}
+       data = {'value': frame.values.ravel('F'),
+               'variable': np.asarray(frame.columns).repeat(N),
+               'date': np.tile(np.asarray(frame.index), K)}
        return pd.DataFrame(data, columns=['date', 'variable', 'value'])
+
+
    df = unpivot(tm.makeTimeDataFrame())

 To select out everything for variable ``A`` we could do:
diff --git a/doc/source/timeseries.rst b/doc/source/timeseries.rst
index a52c80106f100..42fd356bbe65a 100644
--- a/doc/source/timeseries.rst
+++ b/doc/source/timeseries.rst
@@ -898,7 +898,7 @@ custom date increment logic, such as adding business days:

 .. code-block:: python

     class BDay(DateOffset):
-       """DateOffset increments between business days"""
+        """DateOffset increments between business days"""
        def apply(self, other):
            ...
@@ -2133,7 +2133,8 @@ To convert from an ``int64`` based YYYYMMDD representation.

    s

    def conv(x):
-      return pd.Period(year = x // 10000, month = x//100 % 100, day = x%100, freq='D')
+       return pd.Period(year=x // 10000, month=x // 100 % 100,
+                        day=x % 100, freq='D')

    s.apply(conv)
    s.apply(conv)[2]
diff --git a/pandas/core/accessor.py b/pandas/core/accessor.py
index bc91372e3ac7d..6694737737562 100644
--- a/pandas/core/accessor.py
+++ b/pandas/core/accessor.py
@@ -204,7 +204,8 @@ def decorator(accessor):

     .. code-block:: python

-        def __init__(self, pandas_object):
+        def __init__(self, pandas_object):  # noqa: E999
+            ...

     For consistency with pandas methods, you should raise an ``AttributeError``
     if the data passed to your accessor has an incorrect dtype.
diff --git a/pandas/core/missing.py b/pandas/core/missing.py
index b2daec327d618..222873cd7f81a 100644
--- a/pandas/core/missing.py
+++ b/pandas/core/missing.py
@@ -760,9 +760,10 @@ def _interp_limit(invalid, fw_limit, bw_limit):

     .. code-block:: python

-        for x in np.where(invalid)[0]:
-            if invalid[max(0, x - fw_limit):x + bw_limit + 1].all():
-                yield x
+        def _interp_limit(invalid, fw_limit, bw_limit):
+            for x in np.where(invalid)[0]:
+                if invalid[max(0, x - fw_limit):x + bw_limit + 1].all():
+                    yield x

     """
     # handle forward first; the backward direction is the same except
     # 1. operate on the reversed array
diff --git a/setup.cfg b/setup.cfg
index 17b88d084ebf6..4726a0ddb2fb2 100644
--- a/setup.cfg
+++ b/setup.cfg
@@ -30,6 +30,12 @@ exclude =
     versioneer.py,
     env  # exclude asv benchmark environments from linting

+[flake8-rst]
+ignore =
+    F821,  # undefined name
+    W391,  # blank line at end of file [Seems to be a bug (v0.4.1)]
+
+
 [yapf]
based_on_style = pep8
split_before_named_assigns = false