Skip to content

Commit

Permalink
Merge branch 'master' into excel-tables-pandas-dev#24862
Browse files Browse the repository at this point in the history
  • Loading branch information
tdamsma committed Jan 27, 2019
2 parents 32f10e5 + 2b16e2e commit 5f9d664
Show file tree
Hide file tree
Showing 90 changed files with 3,303 additions and 1,635 deletions.
4 changes: 2 additions & 2 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -101,14 +101,14 @@ asv_bench/pandas/
# Documentation generated files #
#################################
doc/source/generated
doc/source/api/generated
doc/source/user_guide/styled.xlsx
doc/source/reference/api
doc/source/_static
doc/source/vbench
doc/source/vbench.rst
doc/source/index.rst
doc/build/html/index.html
# Windows specific leftover:
doc/tmp.sv
doc/source/styled.xlsx
env/
doc/source/savefig/
19 changes: 13 additions & 6 deletions asv_bench/benchmarks/categoricals.py
Original file line number Diff line number Diff line change
Expand Up @@ -223,12 +223,19 @@ class CategoricalSlicing(object):

def setup(self, index):
N = 10**6
values = list('a' * N + 'b' * N + 'c' * N)
indices = {
'monotonic_incr': pd.Categorical(values),
'monotonic_decr': pd.Categorical(reversed(values)),
'non_monotonic': pd.Categorical(list('abc' * N))}
self.data = indices[index]
categories = ['a', 'b', 'c']
values = [0] * N + [1] * N + [2] * N
if index == 'monotonic_incr':
self.data = pd.Categorical.from_codes(values,
categories=categories)
elif index == 'monotonic_decr':
self.data = pd.Categorical.from_codes(list(reversed(values)),
categories=categories)
elif index == 'non_monotonic':
self.data = pd.Categorical.from_codes([0, 1, 2] * N,
categories=categories)
else:
raise ValueError('Invalid index param: {}'.format(index))

self.scalar = 10000
self.list = list(range(10000))
Expand Down
2 changes: 1 addition & 1 deletion asv_bench/benchmarks/ctors.py
Original file line number Diff line number Diff line change
Expand Up @@ -72,7 +72,7 @@ class SeriesDtypesConstructors(object):

def setup(self):
N = 10**4
self.arr = np.random.randn(N, N)
self.arr = np.random.randn(N)
self.arr_str = np.array(['foo', 'bar', 'baz'], dtype=object)
self.s = Series([Timestamp('20110101'), Timestamp('20120101'),
Timestamp('20130101')] * N * 10)
Expand Down
3 changes: 2 additions & 1 deletion asv_bench/benchmarks/index_object.py
Original file line number Diff line number Diff line change
Expand Up @@ -138,7 +138,8 @@ def setup(self, dtype):
self.sorted = self.idx.sort_values()
half = N // 2
self.non_unique = self.idx[:half].append(self.idx[:half])
self.non_unique_sorted = self.sorted[:half].append(self.sorted[:half])
self.non_unique_sorted = (self.sorted[:half].append(self.sorted[:half])
.sort_values())
self.key = self.sorted[N // 4]

def time_boolean_array(self, dtype):
Expand Down
8 changes: 3 additions & 5 deletions doc/make.py
Original file line number Diff line number Diff line change
Expand Up @@ -53,7 +53,7 @@ def __init__(self, num_jobs=0, include_api=True, single_doc=None,
if single_doc and single_doc.endswith('.rst'):
self.single_doc_html = os.path.splitext(single_doc)[0] + '.html'
elif single_doc:
self.single_doc_html = 'api/generated/pandas.{}.html'.format(
self.single_doc_html = 'reference/api/pandas.{}.html'.format(
single_doc)

def _process_single_doc(self, single_doc):
Expand All @@ -63,7 +63,7 @@ def _process_single_doc(self, single_doc):
For example, categorical.rst or pandas.DataFrame.head. For the latter,
return the corresponding file path
(e.g. generated/pandas.DataFrame.head.rst).
(e.g. reference/api/pandas.DataFrame.head.rst).
"""
base_name, extension = os.path.splitext(single_doc)
if extension in ('.rst', '.ipynb'):
Expand Down Expand Up @@ -121,8 +121,6 @@ def _sphinx_build(self, kind):
raise ValueError('kind must be html or latex, '
'not {}'.format(kind))

self.clean()

cmd = ['sphinx-build', '-b', kind]
if self.num_jobs:
cmd += ['-j', str(self.num_jobs)]
Expand Down Expand Up @@ -260,7 +258,7 @@ def clean():
Clean documentation generated files.
"""
shutil.rmtree(BUILD_PATH, ignore_errors=True)
shutil.rmtree(os.path.join(SOURCE_PATH, 'api', 'generated'),
shutil.rmtree(os.path.join(SOURCE_PATH, 'reference', 'api'),
ignore_errors=True)

def zip_html(self):
Expand Down
1,544 changes: 1,544 additions & 0 deletions doc/redirects.csv

Large diffs are not rendered by default.

File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
15 changes: 15 additions & 0 deletions doc/source/getting_started/comparison/index.rst
Original file line number Diff line number Diff line change
@@ -0,0 +1,15 @@
{{ header }}

.. _comparison:

===========================
Comparison with other tools
===========================

.. toctree::
:maxdepth: 2

comparison_with_r
comparison_with_sql
comparison_with_sas
comparison_with_stata
1 change: 1 addition & 0 deletions doc/source/getting_started/index.rst
Original file line number Diff line number Diff line change
Expand Up @@ -13,4 +13,5 @@ Getting started
10min
basics
dsintro
comparison/index
tutorials
93 changes: 74 additions & 19 deletions doc/source/getting_started/overview.rst
Original file line number Diff line number Diff line change
Expand Up @@ -6,25 +6,80 @@
Package overview
****************

:mod:`pandas` is an open source, BSD-licensed library providing high-performance,
easy-to-use data structures and data analysis tools for the `Python <https://www.python.org/>`__
programming language.

:mod:`pandas` consists of the following elements:

* A set of labeled array data structures, the primary of which are
Series and DataFrame.
* Index objects enabling both simple axis indexing and multi-level /
hierarchical axis indexing.
* An integrated group by engine for aggregating and transforming data sets.
* Date range generation (date_range) and custom date offsets enabling the
implementation of customized frequencies.
* Input/Output tools: loading tabular data from flat files (CSV, delimited,
Excel 2003), and saving and loading pandas objects from the fast and
efficient PyTables/HDF5 format.
* Memory-efficient "sparse" versions of the standard data structures for storing
data that is mostly missing or mostly constant (some fixed value).
* Moving window statistics (rolling mean, rolling standard deviation, etc.).
**pandas** is a `Python <https://www.python.org>`__ package providing fast,
flexible, and expressive data structures designed to make working with
"relational" or "labeled" data both easy and intuitive. It aims to be the
fundamental high-level building block for doing practical, **real world** data
analysis in Python. Additionally, it has the broader goal of becoming **the
most powerful and flexible open source data analysis / manipulation tool
available in any language**. It is already well on its way toward this goal.

pandas is well suited for many different kinds of data:

- Tabular data with heterogeneously-typed columns, as in an SQL table or
Excel spreadsheet
- Ordered and unordered (not necessarily fixed-frequency) time series data
- Arbitrary matrix data (homogeneously typed or heterogeneous) with row and
column labels
- Any other form of observational / statistical data sets. The data actually
need not be labeled at all to be placed into a pandas data structure

The two primary data structures of pandas, :class:`Series` (1-dimensional)
and :class:`DataFrame` (2-dimensional), handle the vast majority of typical use
cases in finance, statistics, social science, and many areas of
engineering. For R users, :class:`DataFrame` provides everything that R's
``data.frame`` provides and much more. pandas is built on top of `NumPy
<https://www.numpy.org>`__ and is intended to integrate well within a scientific
computing environment with many other 3rd party libraries.

Here are just a few of the things that pandas does well:

- Easy handling of **missing data** (represented as NaN) in floating point as
well as non-floating point data
- Size mutability: columns can be **inserted and deleted** from DataFrame and
higher dimensional objects
- Automatic and explicit **data alignment**: objects can be explicitly
aligned to a set of labels, or the user can simply ignore the labels and
let ``Series``, ``DataFrame``, etc. automatically align the data for you in
computations
- Powerful, flexible **group by** functionality to perform
split-apply-combine operations on data sets, for both aggregating and
transforming data
- Make it **easy to convert** ragged, differently-indexed data in other
Python and NumPy data structures into DataFrame objects
- Intelligent label-based **slicing**, **fancy indexing**, and **subsetting**
of large data sets
- Intuitive **merging** and **joining** of data sets
- Flexible **reshaping** and pivoting of data sets
- **Hierarchical** labeling of axes (possible to have multiple labels per
tick)
- Robust IO tools for loading data from **flat files** (CSV and delimited),
Excel files, databases, and saving / loading data from the ultrafast **HDF5
format**
- **Time series**-specific functionality: date range generation and frequency
conversion, moving window statistics, moving window linear regressions,
date shifting and lagging, etc.

Many of these principles are here to address the shortcomings frequently
experienced using other languages / scientific research environments. For data
scientists, working with data is typically divided into multiple stages:
munging and cleaning data, analyzing / modeling it, then organizing the results
of the analysis into a form suitable for plotting or tabular display. pandas
is the ideal tool for all of these tasks.

Some other notes

- pandas is **fast**. Many of the low-level algorithmic bits have been
extensively tweaked in `Cython <https://cython.org>`__ code. However, as with
anything else, generalization usually sacrifices performance. So if you focus
on one feature for your application you may be able to create a faster
specialized tool.

- pandas is a dependency of `statsmodels
<https://www.statsmodels.org/stable/index.html>`__, making it an important part of the
statistical computing ecosystem in Python.

- pandas has been used extensively in production in financial applications.

Data Structures
---------------
Expand Down
125 changes: 18 additions & 107 deletions doc/source/index.rst.template
Original file line number Diff line number Diff line change
@@ -1,141 +1,52 @@
.. pandas documentation master file, created by

.. module:: pandas

*********************************************
pandas: powerful Python data analysis toolkit
*********************************************

`PDF Version <pandas.pdf>`__

`Zipped HTML <pandas.zip>`__

.. module:: pandas

**Date**: |today| **Version**: |version|

**Binary Installers:** https://pypi.org/project/pandas

**Source Repository:** https://github.com/pandas-dev/pandas

**Issues & Ideas:** https://github.com/pandas-dev/pandas/issues

**Q&A Support:** https://stackoverflow.com/questions/tagged/pandas

**Developer Mailing List:** https://groups.google.com/forum/#!forum/pydata

**pandas** is a `Python <https://www.python.org>`__ package providing fast,
flexible, and expressive data structures designed to make working with
"relational" or "labeled" data both easy and intuitive. It aims to be the
fundamental high-level building block for doing practical, **real world** data
analysis in Python. Additionally, it has the broader goal of becoming **the
most powerful and flexible open source data analysis / manipulation tool
available in any language**. It is already well on its way toward this goal.

pandas is well suited for many different kinds of data:

- Tabular data with heterogeneously-typed columns, as in an SQL table or
Excel spreadsheet
- Ordered and unordered (not necessarily fixed-frequency) time series data.
- Arbitrary matrix data (homogeneously typed or heterogeneous) with row and
column labels
- Any other form of observational / statistical data sets. The data actually
need not be labeled at all to be placed into a pandas data structure

The two primary data structures of pandas, :class:`Series` (1-dimensional)
and :class:`DataFrame` (2-dimensional), handle the vast majority of typical use
cases in finance, statistics, social science, and many areas of
engineering. For R users, :class:`DataFrame` provides everything that R's
``data.frame`` provides and much more. pandas is built on top of `NumPy
<https://www.numpy.org>`__ and is intended to integrate well within a scientific
computing environment with many other 3rd party libraries.

Here are just a few of the things that pandas does well:

- Easy handling of **missing data** (represented as NaN) in floating point as
well as non-floating point data
- Size mutability: columns can be **inserted and deleted** from DataFrame and
higher dimensional objects
- Automatic and explicit **data alignment**: objects can be explicitly
aligned to a set of labels, or the user can simply ignore the labels and
let `Series`, `DataFrame`, etc. automatically align the data for you in
computations
- Powerful, flexible **group by** functionality to perform
split-apply-combine operations on data sets, for both aggregating and
transforming data
- Make it **easy to convert** ragged, differently-indexed data in other
Python and NumPy data structures into DataFrame objects
- Intelligent label-based **slicing**, **fancy indexing**, and **subsetting**
of large data sets
- Intuitive **merging** and **joining** data sets
- Flexible **reshaping** and pivoting of data sets
- **Hierarchical** labeling of axes (possible to have multiple labels per
tick)
- Robust IO tools for loading data from **flat files** (CSV and delimited),
Excel files, databases, and saving / loading data from the ultrafast **HDF5
format**
- **Time series**-specific functionality: date range generation and frequency
conversion, moving window statistics, moving window linear regressions,
date shifting and lagging, etc.

Many of these principles are here to address the shortcomings frequently
experienced using other languages / scientific research environments. For data
scientists, working with data is typically divided into multiple stages:
munging and cleaning data, analyzing / modeling it, then organizing the results
of the analysis into a form suitable for plotting or tabular display. pandas
is the ideal tool for all of these tasks.

Some other notes

- pandas is **fast**. Many of the low-level algorithmic bits have been
extensively tweaked in `Cython <https://cython.org>`__ code. However, as with
anything else generalization usually sacrifices performance. So if you focus
on one feature for your application you may be able to create a faster
specialized tool.

- pandas is a dependency of `statsmodels
<https://www.statsmodels.org/stable/index.html>`__, making it an important part of the
statistical computing ecosystem in Python.

- pandas has been used extensively in production in financial applications.

.. note::
**Download documentation**: `PDF Version <pandas.pdf>`__ | `Zipped HTML <pandas.zip>`__

This documentation assumes general familiarity with NumPy. If you haven't
used NumPy much or at all, do invest some time in `learning about NumPy
<https://docs.scipy.org>`__ first.
**Useful links**:
`Binary Installers <https://pypi.org/project/pandas>`__ |
`Source Repository <https://github.com/pandas-dev/pandas>`__ |
`Issues & Ideas <https://github.com/pandas-dev/pandas/issues>`__ |
`Q&A Support <https://stackoverflow.com/questions/tagged/pandas>`__ |
`Mailing List <https://groups.google.com/forum/#!forum/pydata>`__

See the package overview for more detail about what's in the library.
:mod:`pandas` is an open source, BSD-licensed library providing high-performance,
easy-to-use data structures and data analysis tools for the `Python <https://www.python.org/>`__
programming language.

See the :ref:`overview` for more detail about what's in the library.

{% if single_doc and single_doc.endswith('.rst') -%}
.. toctree::
:maxdepth: 4
:maxdepth: 2

{{ single_doc[:-4] }}
{% elif single_doc %}
.. autosummary::
:toctree: api/generated/
:toctree: reference/api/

{{ single_doc }}
{% else -%}
.. toctree::
:maxdepth: 4
:maxdepth: 2
{% endif %}

{% if not single_doc -%}
What's New <whatsnew/v0.24.0>
What's New in 0.25.0 <whatsnew/v0.25.0>
install
getting_started/index
cookbook
user_guide/index
r_interface
ecosystem
comparison_with_r
comparison_with_sql
comparison_with_sas
comparison_with_stata
{% endif -%}
{% if include_api -%}
api/index
reference/index
{% endif -%}
{% if not single_doc -%}
development/index
Expand Down
Loading

0 comments on commit 5f9d664

Please sign in to comment.