diff --git a/doc/pandas-2.0/.gitignore b/doc/pandas-2.0/.gitignore new file mode 100644 index 0000000000000..aeffbbed984ff --- /dev/null +++ b/doc/pandas-2.0/.gitignore @@ -0,0 +1,2 @@ +pandas2-design +_build diff --git a/doc/pandas-2.0/Makefile b/doc/pandas-2.0/Makefile new file mode 100644 index 0000000000000..654dda170fe37 --- /dev/null +++ b/doc/pandas-2.0/Makefile @@ -0,0 +1,233 @@ +# Makefile for Sphinx documentation +# + +# You can set these variables from the command line. +SPHINXOPTS = +SPHINXBUILD = sphinx-build +PAPER = +BUILDDIR = _build + +# User-friendly check for sphinx-build +ifeq ($(shell which $(SPHINXBUILD) >/dev/null 2>&1; echo $$?), 1) +$(error The '$(SPHINXBUILD)' command was not found. Make sure you have Sphinx installed, then set the SPHINXBUILD environment variable to point to the full path of the '$(SPHINXBUILD)' executable. Alternatively you can add the directory with the executable to your PATH. If you don't have Sphinx installed, grab it from http://sphinx-doc.org/) +endif + +# Internal variables. +PAPEROPT_a4 = -D latex_paper_size=a4 +PAPEROPT_letter = -D latex_paper_size=letter +ALLSPHINXOPTS = -d $(BUILDDIR)/doctrees $(PAPEROPT_$(PAPER)) $(SPHINXOPTS) source +# the i18n builder cannot share the environment and doctrees with the others +I18NSPHINXOPTS = $(PAPEROPT_$(PAPER)) $(SPHINXOPTS) source + +.PHONY: help +help: + @echo "Please use \`make ' where is one of" + @echo " html to make standalone HTML files" + @echo " dirhtml to make HTML files named index.html in directories" + @echo " singlehtml to make a single large HTML file" + @echo " pickle to make pickle files" + @echo " json to make JSON files" + @echo " htmlhelp to make HTML files and a HTML help project" + @echo " qthelp to make HTML files and a qthelp project" + @echo " applehelp to make an Apple Help Book" + @echo " devhelp to make HTML files and a Devhelp project" + @echo " epub to make an epub" + @echo " latex to make LaTeX files, you can set PAPER=a4 or PAPER=letter" + @echo " latexpdf to make LaTeX files and run them through pdflatex" + @echo " latexpdfja to make LaTeX files and run them through platex/dvipdfmx" + @echo " text to make text files" + @echo " man to make manual pages" + @echo " texinfo to make Texinfo files" + @echo " info to make Texinfo files and run them through makeinfo" + @echo " gettext to make PO message catalogs" + @echo " changes to make an overview of all changed/added/deprecated items" + @echo " xml to make Docutils-native XML files" + @echo " pseudoxml to make pseudoxml-XML files for display purposes" + @echo " linkcheck to check all external links for integrity" + @echo " doctest to run all doctests embedded in the documentation (if enabled)" + @echo " coverage to run coverage check of the documentation (if enabled)" + +.PHONY: clean +clean: + rm -rf $(BUILDDIR)/* + +.PHONY: html +html: + $(SPHINXBUILD) -b html $(ALLSPHINXOPTS) $(BUILDDIR)/html + @echo + @echo "Build finished. The HTML pages are in $(BUILDDIR)/html." + +.PHONY: dirhtml +dirhtml: + $(SPHINXBUILD) -b dirhtml $(ALLSPHINXOPTS) $(BUILDDIR)/dirhtml + @echo + @echo "Build finished. The HTML pages are in $(BUILDDIR)/dirhtml." + +.PHONY: singlehtml +singlehtml: + $(SPHINXBUILD) -b singlehtml $(ALLSPHINXOPTS) $(BUILDDIR)/singlehtml + @echo + @echo "Build finished. The HTML page is in $(BUILDDIR)/singlehtml." + +.PHONY: pickle +pickle: + $(SPHINXBUILD) -b pickle $(ALLSPHINXOPTS) $(BUILDDIR)/pickle + @echo + @echo "Build finished; now you can process the pickle files." 
+ +.PHONY: json +json: + $(SPHINXBUILD) -b json $(ALLSPHINXOPTS) $(BUILDDIR)/json + @echo + @echo "Build finished; now you can process the JSON files." + +.PHONY: htmlhelp +htmlhelp: + $(SPHINXBUILD) -b htmlhelp $(ALLSPHINXOPTS) $(BUILDDIR)/htmlhelp + @echo + @echo "Build finished; now you can run HTML Help Workshop with the" \ + ".hhp project file in $(BUILDDIR)/htmlhelp." + +.PHONY: qthelp +qthelp: + $(SPHINXBUILD) -b qthelp $(ALLSPHINXOPTS) $(BUILDDIR)/qthelp + @echo + @echo "Build finished; now you can run "qcollectiongenerator" with the" \ + ".qhcp project file in $(BUILDDIR)/qthelp, like this:" + @echo "# qcollectiongenerator $(BUILDDIR)/qthelp/pandas20DesignDocs.qhcp" + @echo "To view the help file:" + @echo "# assistant -collectionFile $(BUILDDIR)/qthelp/pandas20DesignDocs.qhc" + +.PHONY: applehelp +applehelp: + $(SPHINXBUILD) -b applehelp $(ALLSPHINXOPTS) $(BUILDDIR)/applehelp + @echo + @echo "Build finished. The help book is in $(BUILDDIR)/applehelp." + @echo "N.B. You won't be able to view it unless you put it in" \ + "~/Library/Documentation/Help or install it in your application" \ + "bundle." + +.PHONY: devhelp +devhelp: + $(SPHINXBUILD) -b devhelp $(ALLSPHINXOPTS) $(BUILDDIR)/devhelp + @echo + @echo "Build finished." + @echo "To view the help file:" + @echo "# mkdir -p $$HOME/.local/share/devhelp/pandas20DesignDocs" + @echo "# ln -s $(BUILDDIR)/devhelp $$HOME/.local/share/devhelp/pandas20DesignDocs" + @echo "# devhelp" + +.PHONY: epub +epub: + $(SPHINXBUILD) -b epub $(ALLSPHINXOPTS) $(BUILDDIR)/epub + @echo + @echo "Build finished. The epub file is in $(BUILDDIR)/epub." + +.PHONY: latex +latex: + $(SPHINXBUILD) -b latex $(ALLSPHINXOPTS) $(BUILDDIR)/latex + @echo + @echo "Build finished; the LaTeX files are in $(BUILDDIR)/latex." + @echo "Run \`make' in that directory to run these through (pdf)latex" \ + "(use \`make latexpdf' here to do that automatically)." + +.PHONY: latexpdf +latexpdf: + $(SPHINXBUILD) -b latex $(ALLSPHINXOPTS) $(BUILDDIR)/latex + @echo "Running LaTeX files through pdflatex..." + $(MAKE) -C $(BUILDDIR)/latex all-pdf + @echo "pdflatex finished; the PDF files are in $(BUILDDIR)/latex." + +.PHONY: latexpdfja +latexpdfja: + $(SPHINXBUILD) -b latex $(ALLSPHINXOPTS) $(BUILDDIR)/latex + @echo "Running LaTeX files through platex and dvipdfmx..." + $(MAKE) -C $(BUILDDIR)/latex all-pdf-ja + @echo "pdflatex finished; the PDF files are in $(BUILDDIR)/latex." + +.PHONY: text +text: + $(SPHINXBUILD) -b text $(ALLSPHINXOPTS) $(BUILDDIR)/text + @echo + @echo "Build finished. The text files are in $(BUILDDIR)/text." + +.PHONY: man +man: + $(SPHINXBUILD) -b man $(ALLSPHINXOPTS) $(BUILDDIR)/man + @echo + @echo "Build finished. The manual pages are in $(BUILDDIR)/man." + +.PHONY: texinfo +texinfo: + $(SPHINXBUILD) -b texinfo $(ALLSPHINXOPTS) $(BUILDDIR)/texinfo + @echo + @echo "Build finished. The Texinfo files are in $(BUILDDIR)/texinfo." + @echo "Run \`make' in that directory to run these through makeinfo" \ + "(use \`make info' here to do that automatically)." + +.PHONY: info +info: + $(SPHINXBUILD) -b texinfo $(ALLSPHINXOPTS) $(BUILDDIR)/texinfo + @echo "Running Texinfo files through makeinfo..." + make -C $(BUILDDIR)/texinfo info + @echo "makeinfo finished; the Info files are in $(BUILDDIR)/texinfo." + +.PHONY: gettext +gettext: + $(SPHINXBUILD) -b gettext $(I18NSPHINXOPTS) $(BUILDDIR)/locale + @echo + @echo "Build finished. The message catalogs are in $(BUILDDIR)/locale." 
+ +.PHONY: changes +changes: + $(SPHINXBUILD) -b changes $(ALLSPHINXOPTS) $(BUILDDIR)/changes + @echo + @echo "The overview file is in $(BUILDDIR)/changes." + +.PHONY: linkcheck +linkcheck: + $(SPHINXBUILD) -b linkcheck $(ALLSPHINXOPTS) $(BUILDDIR)/linkcheck + @echo + @echo "Link check complete; look for any errors in the above output " \ + "or in $(BUILDDIR)/linkcheck/output.txt." + +.PHONY: doctest +doctest: + $(SPHINXBUILD) -b doctest $(ALLSPHINXOPTS) $(BUILDDIR)/doctest + @echo "Testing of doctests in the sources finished, look at the " \ + "results in $(BUILDDIR)/doctest/output.txt." + +.PHONY: coverage +coverage: + $(SPHINXBUILD) -b coverage $(ALLSPHINXOPTS) $(BUILDDIR)/coverage + @echo "Testing of coverage in the sources finished, look at the " \ + "results in $(BUILDDIR)/coverage/python.txt." + +.PHONY: xml +xml: + $(SPHINXBUILD) -b xml $(ALLSPHINXOPTS) $(BUILDDIR)/xml + @echo + @echo "Build finished. The XML files are in $(BUILDDIR)/xml." + +.PHONY: pseudoxml +pseudoxml: + $(SPHINXBUILD) -b pseudoxml $(ALLSPHINXOPTS) $(BUILDDIR)/pseudoxml + @echo + @echo "Build finished. The pseudo-XML files are in $(BUILDDIR)/pseudoxml." + +OUTPUTDIR=_build/html +DEPLOYREPOSITORY=pandas2-design + +deploy: html + if test -d $(OUTPUTDIR); \ + then echo " (build directory exists)"; \ + else mkdir -p $(OUTPUTDIR); \ + fi + if test -d $(DEPLOYREPOSITORY); \ + then echo " (repository directory exists)"; \ + else git clone git@github.com:wesm/$(DEPLOYREPOSITORY).git; \ + fi + cd $(DEPLOYREPOSITORY) && git pull + rsync -r $(OUTPUTDIR)/* $(DEPLOYREPOSITORY)/ + cd $(DEPLOYREPOSITORY) && git add . && git commit -m "deploy" + cd $(DEPLOYREPOSITORY) && git push origin gh-pages diff --git a/doc/pandas-2.0/index.rst b/doc/pandas-2.0/index.rst new file mode 100644 index 0000000000000..6775e45c84b1f --- /dev/null +++ b/doc/pandas-2.0/index.rst @@ -0,0 +1,22 @@ +.. pandas 2.0 Design Docs documentation master file, created by + sphinx-quickstart on Mon Aug 8 11:48:39 2016. + You can adapt this file completely to your liking, but it should at least + contain the root `toctree` directive. + +Welcome to pandas 2.0 Design Docs's documentation! +================================================== + +Contents: + +.. toctree:: + :maxdepth: 2 + + + +Indices and tables +================== + +* :ref:`genindex` +* :ref:`modindex` +* :ref:`search` + diff --git a/doc/pandas-2.0/make.bat b/doc/pandas-2.0/make.bat new file mode 100644 index 0000000000000..4c22f56b72e9f --- /dev/null +++ b/doc/pandas-2.0/make.bat @@ -0,0 +1,263 @@ +@ECHO OFF + +REM Command file for Sphinx documentation + +if "%SPHINXBUILD%" == "" ( + set SPHINXBUILD=sphinx-build +) +set BUILDDIR=_build +set ALLSPHINXOPTS=-d %BUILDDIR%/doctrees %SPHINXOPTS% . +set I18NSPHINXOPTS=%SPHINXOPTS% . +if NOT "%PAPER%" == "" ( + set ALLSPHINXOPTS=-D latex_paper_size=%PAPER% %ALLSPHINXOPTS% + set I18NSPHINXOPTS=-D latex_paper_size=%PAPER% %I18NSPHINXOPTS% +) + +if "%1" == "" goto help + +if "%1" == "help" ( + :help + echo.Please use `make ^` where ^ is one of + echo. html to make standalone HTML files + echo. dirhtml to make HTML files named index.html in directories + echo. singlehtml to make a single large HTML file + echo. pickle to make pickle files + echo. json to make JSON files + echo. htmlhelp to make HTML files and a HTML help project + echo. qthelp to make HTML files and a qthelp project + echo. devhelp to make HTML files and a Devhelp project + echo. epub to make an epub + echo. latex to make LaTeX files, you can set PAPER=a4 or PAPER=letter + echo. 
text to make text files + echo. man to make manual pages + echo. texinfo to make Texinfo files + echo. gettext to make PO message catalogs + echo. changes to make an overview over all changed/added/deprecated items + echo. xml to make Docutils-native XML files + echo. pseudoxml to make pseudoxml-XML files for display purposes + echo. linkcheck to check all external links for integrity + echo. doctest to run all doctests embedded in the documentation if enabled + echo. coverage to run coverage check of the documentation if enabled + goto end +) + +if "%1" == "clean" ( + for /d %%i in (%BUILDDIR%\*) do rmdir /q /s %%i + del /q /s %BUILDDIR%\* + goto end +) + + +REM Check if sphinx-build is available and fallback to Python version if any +%SPHINXBUILD% 1>NUL 2>NUL +if errorlevel 9009 goto sphinx_python +goto sphinx_ok + +:sphinx_python + +set SPHINXBUILD=python -m sphinx.__init__ +%SPHINXBUILD% 2> nul +if errorlevel 9009 ( + echo. + echo.The 'sphinx-build' command was not found. Make sure you have Sphinx + echo.installed, then set the SPHINXBUILD environment variable to point + echo.to the full path of the 'sphinx-build' executable. Alternatively you + echo.may add the Sphinx directory to PATH. + echo. + echo.If you don't have Sphinx installed, grab it from + echo.http://sphinx-doc.org/ + exit /b 1 +) + +:sphinx_ok + + +if "%1" == "html" ( + %SPHINXBUILD% -b html %ALLSPHINXOPTS% %BUILDDIR%/html + if errorlevel 1 exit /b 1 + echo. + echo.Build finished. The HTML pages are in %BUILDDIR%/html. + goto end +) + +if "%1" == "dirhtml" ( + %SPHINXBUILD% -b dirhtml %ALLSPHINXOPTS% %BUILDDIR%/dirhtml + if errorlevel 1 exit /b 1 + echo. + echo.Build finished. The HTML pages are in %BUILDDIR%/dirhtml. + goto end +) + +if "%1" == "singlehtml" ( + %SPHINXBUILD% -b singlehtml %ALLSPHINXOPTS% %BUILDDIR%/singlehtml + if errorlevel 1 exit /b 1 + echo. + echo.Build finished. The HTML pages are in %BUILDDIR%/singlehtml. + goto end +) + +if "%1" == "pickle" ( + %SPHINXBUILD% -b pickle %ALLSPHINXOPTS% %BUILDDIR%/pickle + if errorlevel 1 exit /b 1 + echo. + echo.Build finished; now you can process the pickle files. + goto end +) + +if "%1" == "json" ( + %SPHINXBUILD% -b json %ALLSPHINXOPTS% %BUILDDIR%/json + if errorlevel 1 exit /b 1 + echo. + echo.Build finished; now you can process the JSON files. + goto end +) + +if "%1" == "htmlhelp" ( + %SPHINXBUILD% -b htmlhelp %ALLSPHINXOPTS% %BUILDDIR%/htmlhelp + if errorlevel 1 exit /b 1 + echo. + echo.Build finished; now you can run HTML Help Workshop with the ^ +.hhp project file in %BUILDDIR%/htmlhelp. + goto end +) + +if "%1" == "qthelp" ( + %SPHINXBUILD% -b qthelp %ALLSPHINXOPTS% %BUILDDIR%/qthelp + if errorlevel 1 exit /b 1 + echo. + echo.Build finished; now you can run "qcollectiongenerator" with the ^ +.qhcp project file in %BUILDDIR%/qthelp, like this: + echo.^> qcollectiongenerator %BUILDDIR%\qthelp\pandas20DesignDocs.qhcp + echo.To view the help file: + echo.^> assistant -collectionFile %BUILDDIR%\qthelp\pandas20DesignDocs.ghc + goto end +) + +if "%1" == "devhelp" ( + %SPHINXBUILD% -b devhelp %ALLSPHINXOPTS% %BUILDDIR%/devhelp + if errorlevel 1 exit /b 1 + echo. + echo.Build finished. + goto end +) + +if "%1" == "epub" ( + %SPHINXBUILD% -b epub %ALLSPHINXOPTS% %BUILDDIR%/epub + if errorlevel 1 exit /b 1 + echo. + echo.Build finished. The epub file is in %BUILDDIR%/epub. + goto end +) + +if "%1" == "latex" ( + %SPHINXBUILD% -b latex %ALLSPHINXOPTS% %BUILDDIR%/latex + if errorlevel 1 exit /b 1 + echo. + echo.Build finished; the LaTeX files are in %BUILDDIR%/latex. 
+ goto end +) + +if "%1" == "latexpdf" ( + %SPHINXBUILD% -b latex %ALLSPHINXOPTS% %BUILDDIR%/latex + cd %BUILDDIR%/latex + make all-pdf + cd %~dp0 + echo. + echo.Build finished; the PDF files are in %BUILDDIR%/latex. + goto end +) + +if "%1" == "latexpdfja" ( + %SPHINXBUILD% -b latex %ALLSPHINXOPTS% %BUILDDIR%/latex + cd %BUILDDIR%/latex + make all-pdf-ja + cd %~dp0 + echo. + echo.Build finished; the PDF files are in %BUILDDIR%/latex. + goto end +) + +if "%1" == "text" ( + %SPHINXBUILD% -b text %ALLSPHINXOPTS% %BUILDDIR%/text + if errorlevel 1 exit /b 1 + echo. + echo.Build finished. The text files are in %BUILDDIR%/text. + goto end +) + +if "%1" == "man" ( + %SPHINXBUILD% -b man %ALLSPHINXOPTS% %BUILDDIR%/man + if errorlevel 1 exit /b 1 + echo. + echo.Build finished. The manual pages are in %BUILDDIR%/man. + goto end +) + +if "%1" == "texinfo" ( + %SPHINXBUILD% -b texinfo %ALLSPHINXOPTS% %BUILDDIR%/texinfo + if errorlevel 1 exit /b 1 + echo. + echo.Build finished. The Texinfo files are in %BUILDDIR%/texinfo. + goto end +) + +if "%1" == "gettext" ( + %SPHINXBUILD% -b gettext %I18NSPHINXOPTS% %BUILDDIR%/locale + if errorlevel 1 exit /b 1 + echo. + echo.Build finished. The message catalogs are in %BUILDDIR%/locale. + goto end +) + +if "%1" == "changes" ( + %SPHINXBUILD% -b changes %ALLSPHINXOPTS% %BUILDDIR%/changes + if errorlevel 1 exit /b 1 + echo. + echo.The overview file is in %BUILDDIR%/changes. + goto end +) + +if "%1" == "linkcheck" ( + %SPHINXBUILD% -b linkcheck %ALLSPHINXOPTS% %BUILDDIR%/linkcheck + if errorlevel 1 exit /b 1 + echo. + echo.Link check complete; look for any errors in the above output ^ +or in %BUILDDIR%/linkcheck/output.txt. + goto end +) + +if "%1" == "doctest" ( + %SPHINXBUILD% -b doctest %ALLSPHINXOPTS% %BUILDDIR%/doctest + if errorlevel 1 exit /b 1 + echo. + echo.Testing of doctests in the sources finished, look at the ^ +results in %BUILDDIR%/doctest/output.txt. + goto end +) + +if "%1" == "coverage" ( + %SPHINXBUILD% -b coverage %ALLSPHINXOPTS% %BUILDDIR%/coverage + if errorlevel 1 exit /b 1 + echo. + echo.Testing of coverage in the sources finished, look at the ^ +results in %BUILDDIR%/coverage/python.txt. + goto end +) + +if "%1" == "xml" ( + %SPHINXBUILD% -b xml %ALLSPHINXOPTS% %BUILDDIR%/xml + if errorlevel 1 exit /b 1 + echo. + echo.Build finished. The XML files are in %BUILDDIR%/xml. + goto end +) + +if "%1" == "pseudoxml" ( + %SPHINXBUILD% -b pseudoxml %ALLSPHINXOPTS% %BUILDDIR%/pseudoxml + if errorlevel 1 exit /b 1 + echo. + echo.Build finished. The pseudo-XML files are in %BUILDDIR%/pseudoxml. + goto end +) + +:end diff --git a/doc/pandas-2.0/source/conf.py b/doc/pandas-2.0/source/conf.py new file mode 100644 index 0000000000000..071c8bd31ea6c --- /dev/null +++ b/doc/pandas-2.0/source/conf.py @@ -0,0 +1,290 @@ +#!/usr/bin/env python3 +# -*- coding: utf-8 -*- +# +# pandas 2.0 Design Docs documentation build configuration file, created by +# sphinx-quickstart on Mon Aug 8 11:48:39 2016. +# +# This file is execfile()d with the current directory set to its +# containing dir. +# +# Note that not all possible configuration values are present in this +# autogenerated file. +# +# All configuration values have a default; values that are commented out +# serve to show the default. + +import sys +import os + +# If extensions (or modules to document with autodoc) are in another directory, +# add these directories to sys.path here. If the directory is relative to the +# documentation root, use os.path.abspath to make it absolute, like shown here. 
+#sys.path.insert(0, os.path.abspath('.')) + +# -- General configuration ------------------------------------------------ + +# If your documentation needs a minimal Sphinx version, state it here. +# needs_sphinx = '1.0' + +# Add any Sphinx extension module names here, as strings. They can be +# extensions coming with Sphinx (named 'sphinx.ext.*') or your custom +# ones. + +extensions = ['IPython.sphinxext.ipython_directive', + 'IPython.sphinxext.ipython_console_highlighting'] + +# Add any paths that contain templates here, relative to this directory. +templates_path = ['_templates'] + +# The suffix(es) of source filenames. +# You can specify multiple suffix as a list of string: +# source_suffix = ['.rst', '.md'] +source_suffix = '.rst' + +# The encoding of source files. +#source_encoding = 'utf-8-sig' + +# The master toctree document. +master_doc = 'index' + +# General information about the project. +project = "Wes's pandas 2.0 Design Docs" +copyright = '2016, Wes McKinney' +author = 'Wes McKinney' + +# The version info for the project you're documenting, acts as replacement for +# |version| and |release|, also used in various other places throughout the +# built documents. +# +# The short X.Y version. +version = '0.1' +# The full version, including alpha/beta/rc tags. +release = '0.1' + +# The language for content autogenerated by Sphinx. Refer to documentation +# for a list of supported languages. +# +# This is also used if you do content translation via gettext catalogs. +# Usually you set "language" from the command line for these cases. +language = None + +# There are two options for replacing |today|: either, you set today to some +# non-false value, then it is used: +#today = '' +# Else, today_fmt is used as the format for a strftime call. +#today_fmt = '%B %d, %Y' + +# List of patterns, relative to source directory, that match files and +# directories to ignore when looking for source files. +exclude_patterns = ['_build'] + +# The reST default role (used for this markup: `text`) to use for all +# documents. +#default_role = None + +# If true, '()' will be appended to :func: etc. cross-reference text. +#add_function_parentheses = True + +# If true, the current module name will be prepended to all description +# unit titles (such as .. function::). +#add_module_names = True + +# If true, sectionauthor and moduleauthor directives will be shown in the +# output. They are ignored by default. +#show_authors = False + +# The name of the Pygments (syntax highlighting) style to use. +pygments_style = 'sphinx' + +# A list of ignored prefixes for module index sorting. +#modindex_common_prefix = [] + +# If true, keep warnings as "system message" paragraphs in the built documents. +#keep_warnings = False + +# If true, `todo` and `todoList` produce output, else they produce nothing. +todo_include_todos = False + + +# -- Options for HTML output ---------------------------------------------- + +# The theme to use for HTML and HTML Help pages. See the documentation for +# a list of builtin themes. +import sphinx_rtd_theme + +html_theme = "sphinx_rtd_theme" + +html_theme_path = [sphinx_rtd_theme.get_html_theme_path()] + +# Theme options are theme-specific and customize the look and feel of a theme +# further. For a list of options available for each theme, see the +# documentation. +#html_theme_options = {} + +# Add any paths that contain custom themes here, relative to this directory. +#html_theme_path = [] + +# The name for this set of Sphinx documents. 
If None, it defaults to +# " v documentation". +#html_title = None + +# A shorter title for the navigation bar. Default is the same as html_title. +#html_short_title = None + +# The name of an image file (relative to this directory) to place at the top +# of the sidebar. +#html_logo = None + +# The name of an image file (within the static path) to use as favicon of the +# docs. This file should be a Windows icon file (.ico) being 16x16 or 32x32 +# pixels large. +#html_favicon = None + +# Add any paths that contain custom static files (such as style sheets) here, +# relative to this directory. They are copied after the builtin static files, +# so a file named "default.css" will overwrite the builtin "default.css". +html_static_path = ['_static'] + +# Add any extra paths that contain custom files (such as robots.txt or +# .htaccess) here, relative to this directory. These files are copied +# directly to the root of the documentation. +#html_extra_path = [] + +# If not '', a 'Last updated on:' timestamp is inserted at every page bottom, +# using the given strftime format. +#html_last_updated_fmt = '%b %d, %Y' + +# If true, SmartyPants will be used to convert quotes and dashes to +# typographically correct entities. +#html_use_smartypants = True + +# Custom sidebar templates, maps document names to template names. +#html_sidebars = {} + +# Additional templates that should be rendered to pages, maps page names to +# template names. +#html_additional_pages = {} + +# If false, no module index is generated. +#html_domain_indices = True + +# If false, no index is generated. +#html_use_index = True + +# If true, the index is split into individual pages for each letter. +#html_split_index = False + +# If true, links to the reST sources are added to the pages. +#html_show_sourcelink = True + +# If true, "Created using Sphinx" is shown in the HTML footer. Default is True. +#html_show_sphinx = True + +# If true, "(C) Copyright ..." is shown in the HTML footer. Default is True. +#html_show_copyright = True + +# If true, an OpenSearch description file will be output, and all pages will +# contain a tag referring to it. The value of this option must be the +# base URL from which the finished HTML is served. +#html_use_opensearch = '' + +# This is the file name suffix for HTML files (e.g. ".xhtml"). +#html_file_suffix = None + +# Language to be used for generating the HTML full-text search index. +# Sphinx supports the following languages: +# 'da', 'de', 'en', 'es', 'fi', 'fr', 'h', 'it', 'ja' +# 'nl', 'no', 'pt', 'ro', 'r', 'sv', 'tr' +#html_search_language = 'en' + +# A dictionary with options for the search language support, empty by default. +# Now only 'ja' uses this config value +#html_search_options = {'type': 'default'} + +# The name of a javascript file (relative to the configuration directory) that +# implements a search results scorer. If empty, the default will be used. +#html_search_scorer = 'scorer.js' + +# Output file base name for HTML help builder. +htmlhelp_basename = 'pandas20DesignDocsdoc' + +# -- Options for LaTeX output --------------------------------------------- + +latex_elements = { +# The paper size ('letterpaper' or 'a4paper'). +#'papersize': 'letterpaper', + +# The font size ('10pt', '11pt' or '12pt'). +#'pointsize': '10pt', + +# Additional stuff for the LaTeX preamble. +#'preamble': '', + +# Latex figure (float) alignment +#'figure_align': 'htbp', +} + +# Grouping the document tree into LaTeX files. 
List of tuples +# (source start file, target name, title, +# author, documentclass [howto, manual, or own class]). +latex_documents = [ + (master_doc, 'pandas20DesignDocs.tex', 'pandas 2.0 Design Docs Documentation', + 'Wes McKinney', 'manual'), +] + +# The name of an image file (relative to this directory) to place at the top of +# the title page. +#latex_logo = None + +# For "manual" documents, if this is true, then toplevel headings are parts, +# not chapters. +#latex_use_parts = False + +# If true, show page references after internal links. +#latex_show_pagerefs = False + +# If true, show URL addresses after external links. +#latex_show_urls = False + +# Documents to append as an appendix to all manuals. +#latex_appendices = [] + +# If false, no module index is generated. +#latex_domain_indices = True + + +# -- Options for manual page output --------------------------------------- + +# One entry per manual page. List of tuples +# (source start file, name, description, authors, manual section). +man_pages = [ + (master_doc, 'pandas20designdocs', 'pandas 2.0 Design Docs Documentation', + [author], 1) +] + +# If true, show URL addresses after external links. +#man_show_urls = False + + +# -- Options for Texinfo output ------------------------------------------- + +# Grouping the document tree into Texinfo files. List of tuples +# (source start file, target name, title, author, +# dir menu entry, description, category) +texinfo_documents = [ + (master_doc, 'pandas20DesignDocs', 'pandas 2.0 Design Docs Documentation', + author, 'pandas20DesignDocs', 'One line description of project.', + 'Miscellaneous'), +] + +# Documents to append as an appendix to all manuals. +#texinfo_appendices = [] + +# If false, no module index is generated. +#texinfo_domain_indices = True + +# How to display URL addresses: 'footnote', 'no', or 'inline'. +#texinfo_show_urls = 'footnote' + +# If true, do not generate a @detailmenu in the "Top" node's menu. +#texinfo_no_detailmenu = False diff --git a/doc/pandas-2.0/source/copyonwrite.rst b/doc/pandas-2.0/source/copyonwrite.rst new file mode 100644 index 0000000000000..321614171c873 --- /dev/null +++ b/doc/pandas-2.0/source/copyonwrite.rst @@ -0,0 +1,5 @@ +.. _copyonwrite: + +================================== + View semantics and Copy-On-Write +================================== diff --git a/doc/pandas-2.0/source/goals.rst b/doc/pandas-2.0/source/goals.rst new file mode 100644 index 0000000000000..cc2ff6467e271 --- /dev/null +++ b/doc/pandas-2.0/source/goals.rst @@ -0,0 +1,194 @@ +.. _goals: + +======================= + Goals and Motivations +======================= + +.. note:: + + These documents are largely written by Wes McKinney, and at this point + reflect his opinions for the time being. Many things may change as we discuss + and work to reach a consensus about the path forward. + +The pandas codebase is now over 8 years old, having grown to over 200,000 lines +of code from its original ~10,000 LOC in the original 0.1 open source release +in January 2010. + +At a high level, the "pandas 2.0" effort is based on a number of observations: + +* The pandas 0.x series of releases have consisted with huge amounts of + iterative improvements to the library along with some major new features, bug + fixes, and improved documentation. There have also been a series of + deprecations, API changes, and other evolutions of pandas's API to account + for suboptimal design choices (for example: the ``.ix`` operator) made in the + early days of the project (2010 to 2012). 
+* The unification of Series and DataFrame internals to be based on a common + ``NDFrame`` base class and "block manager" data structure (originally created + by me in 2011, and heroically driven forward to its modern form by Jeff + Reback), while introducing many benefits to pandas, has come to be viewed as + a long-term source of technical debt and code complexity. +* pandas's ability to support an increasingly broad set of use cases has been + significantly constrained (as will be examined in detail in these documents) + by its tight coupling to NumPy and therefore subject to various limitations + in NumPy. +* Making significant functional additions (particularly filling gaps in NumPy) + to pandas, particularly new data types, has grown increasingly complex with + very obvious accumulations of technical debt. +* pandas is being used increasingly for very large datasets on machines with + many cores and large amounts of RAM (100s of gigabytes to terabytes). It + would be nice to be able to better utilize these larger, beefier systems + within a single Python process. +* pandas is being used increasingly as a computational building block of some + larger system, such as Dask or Apache Spark. We should consider reducing the + overhead for making data accessible to pandas (i.e. via memory-mapping or + other low-overhead memory sharing). +* Rough edges in pandas's implementation (e.g. its handling of missing data + across data types) are being exposed to users. + +These documents are largely concerned with pandas's internal design, which is +mostly invisible to average users. Advanced users of pandas are generally +familiar with some of these internal details, particular around performance and +memory use, and so the degree to which users are impacted will vary quite a +lot. + +Key areas of work +================= + +Possible changes or improvements to pandas's internals fall into a number of +different buckets to be explored in great detail: + +* **Decoupling from NumPy while preserving interoperability**: by eliminating + the presumption that pandas objects internally must contain data stored in + NumPy ``ndarray`` objects, we will be able to bring more consistency to + pandas's semantics and enable the core developers to extend pandas more + cleanly with new data types, data structures, and computational semantics. +* **Exposing a pandas Cython and/or C/C++ API to other Python library + developers**: the internals of Series and DataFrame are only weakly + accessible in other developers' native code. At minimum, we wish to better + enable developers to construct the precise data structures / memory + representation that fill the insides of Series and DataFrame. +* **Improving user control and visibility of memory use**: pandas's memory use, + as a result of its internal implementation, can frequently be opaque to the + user or outright unpredictable. +* **Improving performance and system utilization**: We aim to improve both the + micro (operations that take < 1 ms) and macro (all other operations) + performance of pandas across the board. As part of this, we aim to make it + easier for pandas's core developers to leverage multicore systems to + accelerate computations (without running into any of Python's well-known + concurrency limitations) +* **Removal of deprecated / underutilized functionality**: As the Python data + ecosystem has grown, a number of areas of pandas (e.g. plotting and datasets + with more than 2 dimensions) may be better served by other open source + projects. 
Also, functionality that has been explicitly deprecated or + discouraged from use (like the ``.ix`` indexing operator) would ideally be + removed. + +Non-goals / FAQ +=============== + +As this will be a quite nuanced discussion, especially for those not intimately +familiar with pandas's implementation details, I wanted to speak to a couple of +commonly-asked questions in brief: + +```` + +1. **Will this work make it harder to use pandas with NumPy, scikit-learn, + statsmodels, SciPy, or other libraries that depend on NumPy + interoperability?** + * We are not planning on it. Data that is representable without memory + copying or conversion in NumPy arrays will continue to be 100% + interoperable. + * Data containing missing (NA) values may require explicit conversion where + it is not currently required. For example: integer or boolean type arrays + with missing data. I trust this will be seen as a positive development. + * If anything, more performant and more precise data semantics in pandas will + generally make production code using a downstream library like scikit-learn + more dependable and future-proof. + +```` + +2. **By decoupling from NumPy, it sounds like you are reimplementing NumPy or + adding a new data type system** + + * Simply put: no. But it's more complicated than that because of the + numerous interpretations of "type system". + + * pandas already contains a large amount (10s of KLOCs) of custom + computational code (see, for example, + ``_) that implements + functionality not present in NumPy. + + * pandas already features its own (what I will describe as a) "logical type + system", including things like custom data types (such as that of + ``pandas.Categorical``), pandas-specific missing data representation, and + implicit type casting (e.g. integer to float on introduction of missing + data). Unfortunately, these logical data types are somewhat weakly + expressed, and the mix of NumPy dtype objects and custom pandas types is + problematic for many internal (implementation) and external (user API) + reasons. I will examine in detail the difference between **physical + types** (i.e. NumPy's dtypes) and **logical types** (i.e. what pandas + currently has, implicitly). + +```` + +3. **Shouldn't you try to accomplish your goals by contributing work to NumPy + instead of investing major work in pandas's internals?** + + * In my opinion, this is a "false dichotomy"; i.e. these things are not + mutually exclusive. + + * Yes, we should define, scope, and if possible help implement improvements + to NumPy that make sense. As NumPy serves a significantly larger and more + diverse set of users, major changes to the NumPy C codebase must be + approached more conservatively. + + * It is unclear that pandas's body of domain-specific data handling and + computational code is entirely "in scope" for NumPy. Some technical + details, such as our categorical or datetime data semantics, "group by" + functionality, relational algebra (joins), etc., may be ideal for pandas + but not necessarily ideal for a general user of NumPy. My opinion is that + functionality from NumPy we wish to use in pandas should "pass through" to + the user unmodified, but we must retain the flexibility to work "outside + the box" (implement things not found in NumPy) without adding technical + debt or user API complexity. + +```` + +4. 
**API changes / breaks are thought to be bad; don't you have a + responsibility to maintain backwards compatibility for users that heavily + depend on pandas?** + + * It's true that APIs should not be broken or changed, and as such should be + approached with extreme caution. + + * The goal of the pandas 2.0 initiative is to only make "good" API breaks + that yield a net benefit that can be easily demonstrated. As an example: + adding native missing data support to integer and boolean data (without + casting to another physical storage type) may break user code that has + knowledge of the "rough edge" (the behavior that we are fixing). As these + changes will mostly affect advanced pandas users, I expect they will be + welcomed. + + * Any major API change or break will be documented and justified to assist + with code migration. + + * As soon as we are able, we will post binary development artifacts for the + pandas 2.0 development branch to get early feedback from heavy pandas + users to understand the impact of changes and how we can better help the + existing user base. + + * Some users will find that a certain piece of code has been working "by + accident" (i.e. relying upon undocumented behavior). This kind of breakage + is already a routine occurrence unfortunately. + +Summary +======= + +Overall, the goal of the pandas 2.0 project is to yield a faster, more cleanly +architected, and more future-proof library that is a drop-in replacement for +90-95% of pandas user code. There will be API / code breakages, but the intent +of any code breakage will almost always be to fix something that has been +"wrong" or inconsistent. Many advanced users will have worked around some of +these rough edges, and so their workarounds may either need to be removed or +changed to accommodate the new (and hopefully it can be agreed in each case: +better) semantics. diff --git a/doc/pandas-2.0/source/index.rst b/doc/pandas-2.0/source/index.rst new file mode 100644 index 0000000000000..70a2c25bbf2b5 --- /dev/null +++ b/doc/pandas-2.0/source/index.rst @@ -0,0 +1,24 @@ +Wes's pandas 2.0 Design Documents +================================= + +These are a set of documents, based on discussions started in December 2015, to +assist with discussions around changes to Python pandas's internal design +intended to better accommodate the evolving needs of the growing Python data +userbase and to help ensure that pandas remains a relevant and important +project in the future. + +.. toctree:: + :maxdepth: 3 + + goals + internal-architecture + strings + copyonwrite + removals + +.. Indices and tables +.. ================== + +.. * :ref:`genindex` +.. * :ref:`modindex` +.. * :ref:`search` diff --git a/doc/pandas-2.0/source/internal-architecture.rst b/doc/pandas-2.0/source/internal-architecture.rst new file mode 100644 index 0000000000000..c3d38e2957aa3 --- /dev/null +++ b/doc/pandas-2.0/source/internal-architecture.rst @@ -0,0 +1,714 @@ +.. _internal-architecture: + +.. ipython:: python + :suppress: + + import numpy as np + import pandas as pd + np.set_printoptions(precision=4, suppress=True) + pd.options.display.max_rows = 100 + +=============================== + Internal Architecture Changes +=============================== + +Logical types and Physical Storage Decoupling +============================================= + +Since this is the most important, but perhaps also most controversial, change +(in my opinion) to pandas, I'm going to go over it in great detail. 
I think the +hardest part is coming up with clear language and definitions for concepts so +that we can communicate effectively. For example the term "data type" is vague +and may mean different things to different people. + +A motivating example +~~~~~~~~~~~~~~~~~~~~ + +Before digging too much into the technical details and problems/solutions, +let's look at some code examples. It is not unusual to find code like this in +pandas's internals: + +.. code-block:: python + + def create_from_value(value, index, dtype): + # return a new empty value suitable for the dtype + + if is_datetimetz(dtype): + subarr = DatetimeIndex([value] * len(index), dtype=dtype) + elif is_categorical_dtype(dtype): + subarr = Categorical([value] * len(index)) + else: + if not isinstance(dtype, (np.dtype, type(np.dtype))): + dtype = dtype.dtype + subarr = np.empty(len(index), dtype=dtype) + subarr.fill(value) + +or + +.. code-block:: python + + if is_categorical_dtype(dtype): + upcast_cls = 'category' + elif is_datetimetz(dtype): + upcast_cls = 'datetimetz' + elif issubclass(dtype.type, np.bool_): + upcast_cls = 'bool' + elif issubclass(dtype.type, np.object_): + upcast_cls = 'object' + elif is_datetime64_dtype(dtype): + upcast_cls = 'datetime' + elif is_timedelta64_dtype(dtype): + upcast_cls = 'timedelta' + else: + upcast_cls = 'float' + +I've cherry-picked one of a number of places where this type of datatype-based +branching happens. + +The primary reason for this complexity is that pandas is using both NumPy's +dtype objects (which describe *physical storage*) as well as its own custom +data type objects as a proxy for pandas's *semantic logical types*. + +Let's step back for a second and come up with clear language to steer the +discussion. + +Some definitions +~~~~~~~~~~~~~~~~ + +Here is my attempt at definitions of some of the key terms: + +* **Metadata**: data that describes other data (such as its in-memory layout) + +* **Semantics**: The meaning / abstract interpretation of something. We often + discuss the semantics (meaning) of computer programs (i.e. what they do, + fundamentally) without touching upon low level details like machine + representation, programming languages, compilers, operating systems, etc. + +* **Physical data (or storage) types**: these are metadata objects which + provide a description of the precise structure of a piece of data in memory. + + * In NumPy, the ``numpy.dtype`` object (aka ``PyArray_Descr`` in the C API) + is metadata describing a single cell / value in an array. Combined with the + ``shape`` and ``strides`` attributes of the ``ndarray`` object, you have + enough information to perform O(1) random access on any cell in an + ``ndarray`` and to assign these values to a C type (or, in the case, of + structured dtypes, assign to a packed C struct). + + * This may or may not include a physical representation of NULL or missing + data (for example: nullable float64 might be a physical type indicating a + normal float64 array along with a bitmap of null/not-null indicators). + +* **Logical data type**: metadata which describes the semantic content of a + single value in an array or other collection of values. Depending on the + logical type, it may map 1-to-1 to a physical type or not at all. Here are + some examples: + + * The ``double`` or ``float64`` type may be viewed both as a logical type as + well as a physical type (a 1-to-1 correspondence). 
+ + * pandas's ``category`` dtype contains its own auxiliary array of category + values (for example, the distinct strings collected from a string + array). Based on the number of categories, the category ``codes`` (which + reference the categories array) are stored in the smallest possible integer + physical type (from ``int8`` to ``int64``, depending whether the data type + can accommodate the codes). For example, if there are 50 codes, the data is + represented in ``int8`` storage. For 1000 codes, it would be ``int16``. + + * Another example: timestamps may be physically stored in ``int64`` + storage, and these values are interpreted in the context of a particular + time unit or resolution (e.g. nanoseconds, milliseconds, seconds). + +In general, new logical types may be formed either by placing new semantics on +top of a single physical data type or some composition of physical or logical +types. For example: you could have a categorical type (a logical construct +consisting of multiple arrays of data) whose categories are some other logical +type. + +For historical reasons, **pandas never developed a clear or clean semantic +separation in its user API between logical and physical data types**. Also, the +addition of new, pandas-only "synthetic" dtypes that are unknown to NumPy (like +categorical, datetimetz, etc.) has expanded this conflation considerably. If +you also consider pandas's custom missing / NULL data behavior, the addition of +ad hoc missing data semantics to a physical NumPy data type created, by the +definitions above, a logical data type (call it ``object[nullable]`` for an +object array) without ever explicitly saying so. + +You might be thinking, "Good job, Wes. You really messed that up!" I'd be +inclined to agree with you now in retrospect, but back in 2011 pandas was not +the super popular project that it is today, and we were truly riding on NumPy's +coat tails. So the extent to which NumPy concepts and APIs were used explicitly +in pandas made the library easier to adopt. Now in 2016, this feels +anachronistic / outdated. + +High-level logical type proposal +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +As we have been discussing periodically on the pandas-dev mailing list and +GitHub, I am proposing that we start to unravel our current mess by defining +pandas-specific metadata objects that model the current semantics / behavior of +the project. What does this mean, exactly? + +* Each NumPy dtype object will map 1-to-1 to an equivalent ``pandas.DataType`` + object. +* Existing pandas "extension dtypes" (like ``CategoricalDtype`` and + ``DatetimeTZDtype``), which have been designed to mimic ``numpy.dtype``, will + become logical type subclasses of ``pandas.DataType`` like every other type + in pandas. + +Since pandas is about assisting with data manipulation and analysis, at some +point you must invoke functions that are specialized to the specific physical +memory representation of your data. For example, pandas has its own +implementations of ``ndarray.take`` that are used internally for arrays of +positive integers that may contain NULL / NA values (which are represented as +-1 -- search the codebase for implementations of ``take_1d``). + +The major goals of introducing a logical type abstraction are the follows: + +* Simplifying "dynamic dispatch": invoking the right functions or choosing the + right code branches based on the data type. +* Enabling pandas to decouple both its internal semantics and physical storage + from NumPy's metadata and APIs. 
Note that this is already happening with
+  categorical types, since a particular instance of ``CategoricalDtype`` may
+  physically be stored in one of 4 NumPy data types.
+
+Physical storage decoupling
+~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+By separating pandas data from the presumption of using a particular physical
+``numpy.dtype`` internally, we can:
+
+* Begin to better protect users from NumPy data semantics (which are frequently
+  different from pandas's!) leaking through to the pandas user API. This can
+  enable us to address long-standing inconsistencies or "rough edges" in pandas
+  that have persisted due to our tight semantic coupling to NumPy.
+
+* Consider adding new data structures to pandas, either custom to pandas or
+  provided by 3rd-party libraries, that add new functionality alongside the
+  existing code (presuming NumPy physical storage). As one concrete example,
+  discussed in more detail below, we can enable missing data in integer pandas
+  data by forming a composite data structure consisting of a NumPy array plus a
+  bitmap marking the null / not-null values.
+
+* Start to think about improved behavior around data ownership (like
+  copy-on-write), which may yield many benefits. I will write a dedicated
+  section about this.
+
+Note that none of these points implies that we are trying to use NumPy
+less. We already have large amounts of code that implement algorithms similar
+to those found in NumPy (e.g. ``pandas.unique`` or the implementation of
+``Series.sum``), but taking into account pandas's missing data representation,
+etc. Internally, we can use NumPy when its computational semantics match those
+we've chosen for pandas, and elsewhere we can invoke pandas-specific code.
+
+A major concern here based on these ideas is **preserving NumPy
+interoperability**, so I'll examine this topic in some detail next.
+
+Preserving NumPy interoperability
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+Some of the types of intended interoperability between NumPy and pandas are as
+follows:
+
+* Users can obtain a ``numpy.ndarray`` (possibly a view, depending on the
+  internal block structure, more on this soon) in constant time and without
+  copying the actual data. This has a couple of other implications:
+
+  * Changes made to this array will be reflected in the source pandas object.
+  * If you write C extension code (possibly in Cython) and respect pandas's
+    missing data details, you can invoke certain kinds of fast custom code on
+    pandas data (but it's somewhat inflexible -- see the latest discussion on
+    adding a native code API to pandas).
+
+* NumPy ufuncs (like ``np.sqrt`` or ``np.log``) can be invoked on
+  pandas objects like Series and DataFrame.
+
+* ``numpy.asarray`` will always yield some array, even if it discards metadata
+  or has to create a new array. For example, ``asarray`` invoked on
+  ``pandas.Categorical`` yields a reconstructed array (rather than either the
+  categories or codes internal arrays).
+
+* Many NumPy methods designed to work on subclasses (or duck-typed classes) of
+  ``ndarray`` may be used. For example, ``numpy.sum`` may be used on a Series
+  even though it does not invoke NumPy's internal C sum algorithm. This means
+  that a Series may be used as an interchangeable argument in a large set of
+  functions that only know about NumPy arrays.
+
+By and large, I think much of this can be preserved, but there will be some API
+breakage.
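+
+As a rough illustration of the kinds of interoperability listed above, here is
+a short sketch using current pandas 0.x behavior (exact outputs may differ
+slightly between versions):
+
+.. code-block:: python
+
+   import numpy as np
+   import pandas as pd
+
+   s = pd.Series([1., 2., 3.])
+
+   # Constant-time access to the underlying NumPy data (a view for
+   # NumPy-native dtypes); mutating the view is reflected in the Series
+   arr = s.values
+   arr[0] = 100.
+
+   # NumPy ufuncs can be invoked directly on pandas objects
+   np.sqrt(s)
+
+   # numpy.asarray always yields *some* array; for Categorical it
+   # reconstructs the values rather than exposing codes / categories
+   cat = pd.Categorical(['a', 'b', 'a'])
+   np.asarray(cat)
+
+   # Duck-typed NumPy reductions also work on Series
+   np.sum(s)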
+ +If we add more composite data structures (Categorical can be thought of as +one existing composite data structure) to pandas or alternate non-NumPy data +structures, there will be cases where the semantic information in a Series +cannot be adequately represented in a NumPy array. + +As one example, if we add pandas-only missing data support to integer and +boolean data (a long requested feature), calling ``np.asarray`` on such data +may not have well-defined behavior. As present, pandas is implicitly converting +these types to ``float64`` (see more below), which isn't too great. A decision +does not need to be made now, but the benefits of solving this long-standing +issue may merit breaking ``asarray`` as long as we provide an explicit way to +obtain the original casted ``float64`` NumPy array (with ``NaN`` for NULL/NA +values) + +For pandas data that does not step outside NumPy's semantic realm, we can +continue to provide zero-copy views in many cases. + +Missing data consistency +======================== + +Once the physical memory representation has been effectively decoupled from the +user API, we can consider various approaches to implementing missing data in a +consistent way for every logical pandas data type. + +To motivate this, let's look at some integer data: + +.. ipython:: python + + s = pd.Series([1, 2, 3, 4, 5]) + s + s.dtype + s.values + +If we assign a ``numpy.NaN``, see what happens: + +.. ipython:: python + + s[2] = np.NaN + s + s.dtype + s.values + +The story for boolean data is similar: + +.. ipython:: python + + s = pd.Series([True, False, True]) + s.dtype + s[2] = np.NaN + s.dtype + s.values + +This implicit behavior appears in many scenarios, such as: + +* Loading data from any source: databases, CSV files, R data files, etc. +* Joins or reindexing operations introducing missing data +* Pivot / reshape operations +* Time series resampling +* Certain types of GroupBy operations + +A proposed solution +~~~~~~~~~~~~~~~~~~~ + +My proposal for introducing missing data into any NumPy type outside of +floating point (which uses ``NaN`` for now) and Python object (which uses +``None`` or ``NaN`` interchangeably) is to **allocate and manage an internal +bitmap** (which the user never sees). This has numerous benefits: + +* 1 byte of memory overhead for each 8 values +* Bitmaps can propagate their nulls in C through bitwise ``&`` or ``|`` + operations, which are inexpensive. +* Getting and setting bits on modern hardware is CPU-inexpensive. For + single-pass array operations (like groupbys) on large arrays this may also + result in better CPU cache utilization (fewer main-memory reads of the + bitmap). +* Hardware and SIMD "popcount" intrinsics (which can operate on 64-128 bits at + a time) can be used to count bits and skip null-handling on segments of data + containing no nulls. + +Notably, this is the way that PostgreSQL handles null values. For example, we +might have: + +.. code-block:: + + [0, 1, 2, NA, NA, 5, 6, NA] + + i: 7 6 5 4 3 2 1 0 + bitmap: 0 1 1 0 0 1 1 1 + +Here, the convention of 1 for "not null" (a la PostgreSQL) and +least-significant bit ordering (LSB "bit endianness") is being used. + +Under the new regime, users could simply write: + +.. code-block:: python + + s[2] = pandas.NA + +and the data type would be unmodified. It may be necessary to write something +akin to: + +.. code-block:: python + + s.to_numpy(dtype=np.float64, na_rep=np.nan) + +and that would emulate the current behavior. 
Attempts to use ``__array__` (for +example: calling ``np.sqrt`` on the data) would result in an error since we +will likely want to refuse to make a guess as for what casting behavior the +user desires. + +Tradeoffs +~~~~~~~~~ + +One potential downside of the bitmap approach is that missing data implemented +outside of NumPy's domain will need to be explicitly converted if it is needed +in another library that only knows about NumPy. I argue that this is better +than the current implicit conversion which could yield data loss (for integers +falling outside the exact representable range for ``float64``). + +Removal of BlockManager / new DataFrame internals +================================================= + +Deep inside the belly pandas objects, there is a data structure called +``BlockManager`` which, at a high level, is responsible for managing the +physical arrays where the data inside a Series or DataFrame is looked +after (also Panel / PanelND structure, even though these are on their way to +deprecation). + +While this data structure has served pandas well since its birth 5 years ago +(Summer 2011), it has a number of problems that make its removal and +replacement with something else an attractive option. + +The goal of this section is to explain what the BlockManager is, why it exists +at all, and why we should consider removing it. + +What is ``BlockManager`` and why does it exist? +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +The reason that ``BlockManager`` exists at all goes back to some ancient pandas +history. Originally, the data in ``pandas.DataFrame`` was stored in a Python +``dict`` object. If you pull up pandas 0.1 or 0.2, you will see this. + +Since the business logic of pandas's internals was originally implemented in +pure Python, as it is still is (but much larger / more complex), there was a +marked performance difference between column-oriented operations and +row-oriented operations. The reason for this is not really a memory layout +issue (NumPy users know about how contiguous memory access produces much better +performance) so much as a reliance on NumPy's two-dimensional array operations +for carrying out pandas's computations. So, to do anything row oriented on an +all-numeric DataFrame, pandas would concatenate all of the columns together +(using ``numpy.vstack`` or ``numpy.hstack``) then use array broadcasting or +methods like ``ndarray.sum`` (combined with ``np.isnan`` to mind missing data) +to carry out certain operations. + +1. pandas's early users (i.e. AQR employees) beseeched me to address this + performance issue. Thus ``DataMatrix`` was created, a roughly API-equivalent + object whose internal storage was a 2D NumPy array, intended to be of a + homogeneous type (e.g. ``numpy.float64``). The downside of this was that if + you inserted a string column, everything would become ``numpy.object_`` + dtype. Users did not like that. + +2. It had become apparent that the dichotomy between DataFrame and DataMatrix + (and when to use each) was harming pandas's adoption and confusing users. So + I set about creating a hybrid data structure that had "the best of both + worlds". + +3. The idea was that the BlockManager would track collections of NumPy arrays + having the same dtype, particular as columns were inserted or removed + (i.e. the *building* phase of the DataFrame's lifetime). + +4. 
When you would invoke an operation that benefited from a single + *consolidated* 2-dimensional ndarray of say ``float64`` dtype (for example: + using ``reindex`` or performing a row-oriented operation), the BlockManager + would glue together its accumulated pieces to create a single 2D ndarray of + each data type. This is called **consolidation** in the codebase. + +5. Since in practice, heterogeneous DataFrames had different types interspersed + amongst their columns, the BlockManager maintains a mapping between the + absolute column position and the relative position within the type-specific + 2D "block". + +6. Over time, the BlockManager has been generalized for the 1 through N + dimensional cases, not just the 2D case, so that even Series has a lean + "SingleBlockManager" internally. + +Drawbacks of BlockManager +~~~~~~~~~~~~~~~~~~~~~~~~~ + +While this data structure has enabled pandas to make it this far in life, it +has a number of drawbacks (not a complete list): + +1. **Code complexity**: this has manifested in a number of ways (and probably + others that I'm missing) + + * Making some of the most important algorithms in pandas fast, like joins + and reshape operations, requires carefully constructing the precise block + structure of the output DataFrame so that no further copying or + consolidation will take place. + + * Adding new custom data types to DataFrame and not losing their metadata + (e.g. time zones or categories) has had a sort of "fan out" effect + touching numerous parts of the BlockManager internals. + +2. **Loss of user visibility into memory use and memory layout**: With large + data sets, some "naively" constructed DataFrame objects (e.g. from a dict of + ndarrays) can produce a memory-doubling effect that may cause out-of-memory + errors. Also, consolidated blocks can (depending on the version of pandas) + result in columns having strided / non-contiguous data, resulting in + degraded performance in column-oriented operations. + +3. **Unavoidable consolidation**: Fairly common operations, like ``read_csv``, + may require a consolidation step after completion, which for large data may + result in performance or memory overhead (similar to the above bullet + point). + +4. **Microperformance issues / indexing slowness**: since a DataFrame can be a + sort of many-layered onion, many common pandas operations may weave through + dozens of different functions navigating the structure of the object and + producing the appropriate output. I will talk more about microperformance + later. + +Replacing BlockManager without weakening pandas +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +Our goal in replacing BlockManager would be to achieve: + +* Substantially simpler code +* Easier extensibility with new logical types +* Performance on par (or better) the current implementation +* Better user control over memory use and layout +* Improved microperformance + +I believe we can do this, but it's will require a significant inversion of the +internal code architecture to involve a more native code and less interpreted +Python. For example, it will be difficult or impossible to achieve comparable +performance in row-oriented operations (on consolidated DataFrame objects) with +pure Python code. + +In the next section, I will start making my case for creating a "native core" +library where we can assemble the low level data structures, logical types, and +memory management for pandas. 
Additionally, we would want to port much of pandas's helper Cython code to live
inside this library and operate directly on the internal data structures,
rather than being orchestrated from the Python interpreter level.

Building "libpandas" in C++11/14 for lowest level implementation tier
======================================================================

Currently, pandas is architecturally structured as follows:

* Pure Python implementation of internal data structure business logic
* Algorithms in Cython (more often) or C (less often) to accelerate
  computationally-intensive algorithms

While this has overall made pandas easier to develop and maintain internally
(perhaps increasingly less so over time!), it has had a number of drawbacks, as
we've discussed. I mentioned microperformance above, so about that:

Microperformance
~~~~~~~~~~~~~~~~

Microperformance (operations taking 1 microsecond to 1 millisecond) has
suffered considerably as pandas's internals have expanded to accommodate new
use cases. Fairly simple operations, from indexing to summary statistics, may
pass through multiple layers of scaffolding before hitting the lowest tier of
computations. Take, for example:

.. ipython:: python

    s = pd.Series(np.random.randn(100))
    s.sum()

Profiling ``s.sum()`` with ``%prun`` in IPython, I see 116 function calls
(pandas 0.18.1). Let's look at the microperformance:

.. code-block:: text

    In [14]: timeit s.sum()
    10000 loops, best of 3: 31.7 µs per loop

    In [15]: v = s.values

    In [16]: timeit v.sum()
    1000000 loops, best of 3: 1.07 µs per loop

While a slightly contrived example, the internal data structures and function
dispatch machinery add 30 microseconds of overhead. That may not sound like a
compelling number, but such a method called 1 million times incurs an
additional 30 seconds of overhead. When you consider microperformance in the
context of custom ``groupby`` operations, for example, this is not so
unrealistic.

C or C++ (C++11, to be specific)?
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

At the risk of instigating a religious programming language debate, pandas's
use of Cython in many places is already very C++-like:

* Generic programming through manual code generation (now using tempita)
  instead of templates

* Auxiliary types and data structures as ``cdef class`` extension types

* Relying on Python's reference counting for garbage collection and cleanup
  after exceptions are raised

The "blend C and Cython" style has aided developer productivity, but I argue
that judicious and responsible use of modern C++ (following a reasonable style
guide, such as Google's C++ style guide or some slight variation) will enable
us to:

* Simplify our existing Cython codebase by using templates (and very limited
  template metaprogramming); see the sketch following this list

* More easily write generic, data-type-specific logic that is inlined and
  resolved at compile time

* Use RAII (exception-safe allocation) and smart pointers (``std::unique_ptr``
  and ``std::shared_ptr``) to simplify memory management

* Define performant C++ classes modeling the current internals, with various
  mechanisms for code reuse or type-specific dynamic dispatch (i.e. through
  template classes, CRTP, or simply virtual functions)

* Use C++11 standard library concurrency tools to more easily create concurrent
  / multithreaded implementations of common pandas algorithms
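To illustrate the first and third points, here is a minimal sketch of what a
dtype-generic kernel written once with a template (instead of being generated
per dtype with tempita) might look like, with the output buffer owned by a
smart pointer. ``TakeKernel`` is a hypothetical name, not part of any existing
or proposed libpandas API.

.. code-block:: c++

    #include <cstdint>
    #include <memory>

    // One definition services all element types at compile time; with tempita
    // we would instead generate a separate function per dtype.
    template <typename T>
    std::unique_ptr<T[]> TakeKernel(const T* values, const int64_t* indices,
                                    int64_t length) {
      // The unique_ptr owns the output buffer, so it is released automatically
      // if an exception propagates out before we return (RAII).
      std::unique_ptr<T[]> out(new T[length]);
      for (int64_t i = 0; i < length; ++i) {
        out[i] = values[indices[i]];
      }
      return out;
    }

    // Usage (types resolved at compile time, no code generation step):
    //   auto taken_f64 = TakeKernel<double>(double_data, idx, n);
    //   auto taken_i32 = TakeKernel<int32_t>(int_data, idx, n);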
By pushing down much of the business logic into C++ (with use of the Python and
NumPy C API where relevant), we will be able to achieve macroperformance on par
with or better than the current BlockManager-based implementation, and handily
better microperformance in indexing and simple analytics.

``pandas.Array`` types
~~~~~~~~~~~~~~~~~~~~~~

My gut feeling is that we would want to create relatively simple container
classes having a common ``pandas::Array`` base type in C++, each of which
models a particular logical type. Each array type would have a corresponding
logical type implementation, in the vein of:

.. code-block:: c++

    class Array {
      // public API omitted

     private:
      std::shared_ptr<DataType> type_;
    };

    class CategoricalType : public DataType {
      // implementation omitted

     private:
      std::shared_ptr<Array> categories_;
    };

    class CategoricalArray : public Array {
     public:
      std::shared_ptr<Array> codes() const;
      std::shared_ptr<Array> categories() const;
      // rest of implementation omitted
    };

An Array wrapping a NumPy array would invoke ``Py_DECREF`` in its destructor,
so that after construction one can proceed largely with C++ programming
semantics without much need for manual memory management.

These Array types would be wrapped and exposed to pandas developers (probably
in Cython).

Index types
~~~~~~~~~~~

Like pandas's current code structure, Index types would be composed from the
Array types and some additional data structures (hash tables) for lookups and
other index operations. These can be similarly exposed to the world via Cython
(and wrapped in a convenient ``pandas.Index`` class).

``pandas.Table``
~~~~~~~~~~~~~~~~

My recommendation is to decommission the BlockManager in favor of a much
simpler low-level Table class, which operates more similarly to an R data.frame
(e.g. no row index). This would look something like:

.. code-block:: c++

    class Table {
     public:
      std::shared_ptr<Array> GetColumn(int i);
      void SetColumn(int i, const std::shared_ptr<Array>& arr);

      // rest of public API omitted
     private:
      // Column index, possibly not necessary
      std::shared_ptr<Index> columns_;

      // List of arrays
      std::vector<std::shared_ptr<Array>> data_;
    };

Operators and dynamic dispatch
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

Under this proposed class structure, it may not make sense to add operations as
class methods. We could possibly do something like:

.. code-block:: c++

    #include "pandas/dispatch.h"

    // other includes omitted

    using ArrayRef = std::shared_ptr<Array>;

    template <typename U, typename V>
    inline ArrayRef TakeImpl(U, V) {
      // Implementation omitted
    }

    ArrayRef Take(ArrayRef values, ArrayRef indices) {
      return Dispatch<TakeImpl>(values, indices);
    }

Here, the ``Dispatch`` template would generate the matrix of logical type
combinations, some of which might throw a not-implemented exception.

There are other approaches to dealing with runtime dispatch that don't carry
too much overhead.

Memory accounting
~~~~~~~~~~~~~~~~~

If pandas's internals are encapsulated in C++ classes inside the libpandas core
library, we could atomically track all memory allocations and deallocations to
produce a precise accounting of the number of bytes that pandas has currently
allocated (excluding memory that is opaque to us; for arrays of Python objects
we would only count the footprint of the ``PyObject*`` pointer array, not the
objects themselves).
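As a rough sketch of what such accounting could look like, all libpandas
allocations could be routed through a pool object holding an atomic byte
counter. The ``MemoryPool`` class below is hypothetical and not an existing
pandas API; it only illustrates the idea.

.. code-block:: c++

    #include <atomic>
    #include <cstdint>
    #include <cstdlib>

    class MemoryPool {
     public:
      // Returns nullptr on failure; a real implementation would raise an error.
      void* Allocate(std::size_t size) {
        void* ptr = std::malloc(size);
        if (ptr != nullptr) {
          // Atomic update, so accounting stays correct across threads.
          bytes_allocated_.fetch_add(static_cast<int64_t>(size));
        }
        return ptr;
      }

      void Free(void* ptr, std::size_t size) {
        std::free(ptr);
        bytes_allocated_.fetch_sub(static_cast<int64_t>(size));
      }

      // Number of bytes currently allocated through this pool; this is what a
      // user-facing "how much memory does pandas own?" query would report.
      int64_t bytes_allocated() const { return bytes_allocated_.load(); }

     private:
      std::atomic<int64_t> bytes_allocated_{0};
    };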
Development toolchain
~~~~~~~~~~~~~~~~~~~~~

Introducing C++11 to pandas's development toolchain will add quite a bit of
complexity for developers, especially compared with pandas's current Cython and
C codebase, which basically builds out of the box for most people. For
cross-platform support, it would be better to use CMake than anything else
(distutils does not have adequate support for C++).

Logical types for strings and possibly other non-numeric data
==============================================================

I believe that frequently-occurring data types, such as UTF-8 strings, are
important enough to deserve a dedicated logical pandas data type. This will
enable us both to enforce tighter API semantics (i.e. attempts to assign a
non-string into string data will be a ``TypeError``) and to improve performance
and memory use under the hood. I will devote an entire section to talking about
strings.

In general, I would be supportive of making Python object (``numpy.object_``
dtype) arrays the solution only for mixed-type arrays and for data types for
which pandas has no native handling.

3rd-party native API (i.e. Cython and C / C++)
==============================================

Developers of 3rd-party projects (myself included) have often expressed a
desire to be able to inspect, construct, or otherwise manipulate pandas objects
(even if in a limited fashion) in compiled code (Cython, C, or C++).

Per the discussion of libpandas and a native core, I would propose the
following:

* Define public-facing ``.pxd`` files that allow developers to use ``cimport``
  and get access to pandas's internal extension types.
* Define factory functions that enable fully formed Series and DataFrame
  objects to be constructed either by Cython API calls or potentially also C++
  libpandas API calls.
* Provide Cython APIs for 3rd-party developers to obtain pointers to the
  underlying C++ objects contained in the wrapper Python objects.

diff --git a/doc/pandas-2.0/source/removals.rst b/doc/pandas-2.0/source/removals.rst
new file mode 100644
index 0000000000000..5f10485b31405
--- /dev/null
+++ b/doc/pandas-2.0/source/removals.rst
@@ -0,0 +1,78 @@

.. _removals:

================================
 Code to remove and other ideas
================================

Dropping Python 2 support
=========================

With Python 2.7 reaching its supported end-of-life in 2020, like some other
Python projects (e.g. IPython / Jupyter) we should seriously contemplate making
pandas 2.0 support only Python 3.5 and higher. In addition to lowering the
development burden at both the C API and pure Python level, we can also finally
look to take advantage of features (things like ``asyncio``, maybe?) only
available in Python 3.

Deprecated code to remove
=========================

* ``.ix`` indexing entirely
* ``Panel`` and ``PanelND`` classes
* Plotting?

Other ideas
===========

Here's a collection of other miscellaneous ideas that don't necessarily fit
elsewhere in these documents.

Column statistics
~~~~~~~~~~~~~~~~~

In quite a few pandas algorithms, there are characteristics of the data that
are very useful to know, such as the following (a sketch of how these might be
computed is given after the list):

* **Monotonicity**: for comparable data (e.g. numbers), is the data sorted /
  strictly increasing? In time series, this permits sorting steps to be
  skipped.

* **Null count**: for data not containing any nulls, the null handling path in
  some algorithms can be skipped entirely.
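Here is a minimal sketch of computing both statistics in a single pass over a
column with a validity bitmap (1 = valid), so they could be cached on the
column and consulted by algorithms. The ``ColumnStats`` and ``ComputeStats``
names are hypothetical, not part of any existing or proposed API.

.. code-block:: c++

    #include <cstddef>
    #include <cstdint>
    #include <vector>

    struct ColumnStats {
      bool is_monotonic_increasing;
      int64_t null_count;
    };

    // Bit i of the bitmap indicates whether slot i holds a valid value.
    inline bool BitIsSet(const std::vector<uint8_t>& bitmap, std::size_t i) {
      return (bitmap[i / 8] >> (i % 8)) & 1;
    }

    ColumnStats ComputeStats(const std::vector<double>& values,
                             const std::vector<uint8_t>& valid_bitmap) {
      ColumnStats stats{true, 0};
      bool have_prev = false;
      double prev = 0.0;
      for (std::size_t i = 0; i < values.size(); ++i) {
        if (!BitIsSet(valid_bitmap, i)) {
          ++stats.null_count;
          continue;
        }
        // Any decrease among the valid values breaks monotonicity.
        if (have_prev && values[i] < prev) {
          stats.is_monotonic_increasing = false;
        }
        prev = values[i];
        have_prev = true;
      }
      return stats;
    }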
Strided arrays: more trouble than they are worth?
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

Per the general discussion around changing DataFrame's internals to contain a
list / ``std::vector`` of arrays internally, for me this raises the question of
whether it is worth continuing to accommodate strided one-dimensional data.

Some pros for eliminating strided data completely:

* Guaranteeing contiguous memory internally will yield more consistent and
  predictable performance.

* Not needing to consider a stride different from 1 means simpler low-level
  array indexing code (e.g. you can work with plain C arrays). The stride is a
  complexity / overhead that leaks into every algorithm that iterates over an
  array.

* You avoid strange situations where a strided view holds onto a base ndarray
  reference to a much larger array.

* **Example:** a previously reported case where the internal orientation
  (column-major vs. row-major) of the array backing a DataFrame was not clear
  to the user.

Some cons:

* It would not be possible to perform zero-copy computations on a strided NumPy
  array.

* Relatedly, initializing a Series or DataFrame from strided memory would
  require allocating an equivalent amount of contiguous memory for each of the
  columns.

For me, at least, the cons are not compelling enough to warrant the code
complexity tradeoff.

diff --git a/doc/pandas-2.0/source/strings.rst b/doc/pandas-2.0/source/strings.rst
new file mode 100644
index 0000000000000..aa065cee4844f
--- /dev/null
+++ b/doc/pandas-2.0/source/strings.rst
@@ -0,0 +1,195 @@

.. _strings:

.. ipython:: python
    :suppress:

    import numpy as np
    import pandas as pd
    np.set_printoptions(precision=4, suppress=True)
    pd.options.display.max_rows = 100

==================================
 Enhanced string / UTF-8 handling
==================================

There are some things we can do to make pandas use less memory and perform
computations significantly faster on string data.

Current string problems
=======================

pandas offers support for columns containing strings (ASCII or Unicode) on a
somewhat ad hoc basis.

* Strings are stored in NumPy arrays of ``PyObject*`` / ``numpy.object_``
  dtype. This has several problems:

  * Computations (e.g. ``groupby`` operations) typically utilize a code path
    for generic Python objects. For example, comparisons and hashing go through
    the ``PyObject_*`` C API functions. In addition to harming multithreading
    due to GIL contention (you must acquire the GIL to use these functions),
    these can also be significantly slower than algorithms that operate on
    ``const char*``, potentially taking advantage of hardware optimizations.

  * String arrays often feature many copies of or references to the same
    PyString. Thus, some algorithms may perform redundant computation. Some
    parts of pandas, like ``pandas.read_csv``, make an effort to deduplicate
    strings to save memory and accelerate computations (e.g. if you do ``x ==
    y``, and ``x`` and ``y`` are references to the same ``PyObject*``, Python
    skips comparing their internal data).

    * Note that this is somewhat mitigated by using ``pandas.Categorical``, but
      this is not the default storage mechanism. More on this below.
  * Using ``PyString`` objects and ``PyObject*`` NumPy storage adds non-trivial
    memory overhead (approximately 24 bytes per unique object) to each value.

Possible solution: new non-NumPy string memory layout
======================================================

My proposed solution to the string conundrum is the following:

* Create a custom string array container type suitable for use in a
  ``pandas.Array``, and a ``pandas.string`` logical data type.
* Require that all strings be encoded as UTF-8.
* By default, represent all string arrays internally as dictionary-encoded,
  a.k.a. categorical. Thus, we will typically only ever have 1 copy of any
  given string in an array.
* Store the actual string data in a packed UTF-8 buffer. I have seen this in a
  number of places, but notably it is the way that Apache Arrow implements
  variable-length collections.

Here is one possible C struct-like layout of this container:

.. code-block:: c++

    typedef struct {
      /* Dictionary indices into the string data, one per logical value */
      uint32_t* indices;

      /* Offsets into the packed string data (one more entry than the number
         of distinct strings) */
      uint32_t* offsets;

      /* The packed UTF-8 data */
      const char* data;

      /* Validity bitmap for nullness */
      uint8_t* bitmap;
    } string_array_t;

Here's an example of what the data would look like:

.. code-block:: text

    actual data : ['foo', 'bars', 'foo', null, 'bars']

    indices: [0, 1, 0, 0, 1]

                                       bitmap[0]
    bitmap (read right-to-left):  |0 0 0 1 0 1 1 1|

    offsets: [0, 3, 7]
    data: ['f', 'o', 'o', 'b', 'a', 'r', 's']

Some benefits of this approach include:

* Much better data locality for low-cardinality categorical data.
* 8.125 bytes (8 bytes plus 1 bit) of memory overhead per value, versus the
  current 24 bytes.
* The data is already dictionary-encoded: casting to ``category`` dtype can be
  performed very cheaply and without duplicating the underlying string memory
  buffer.
* Computations like ``groupby`` on dictionary-encoded strings will be as
  performant as they currently are on ``Categorical``.

Some drawbacks:

* This memory layout is best used as an immutable representation. Mutating
  slots becomes more complex: single-value assignments as well as put /
  array-assignment operations will likely require constructing a new ``data``
  buffer (either by ``realloc`` or some other copying mechanism). Without a
  compaction / "garbage collection" step on this buffer, it will be possible to
  have "dead" memory inside it (for example, if you did ``arr[:] =
  'a-new-string-value'``, all the existing values would be orphaned).

  * Some systems have addressed this issue by storing all string data in a
    "global string hash table". This is something we could explore, but it
    would add quite a bit of complexity to implement and may not be worthwhile
    at this time.

* Indexing into this data structure to obtain a single Python object will
  probably want to call ``PyUnicode_FromStringAndSize`` to construct a string
  (Python 3, therefore Unicode). This requires a memory allocation, whereas
  currently it only has to do a ``Py_INCREF``.

* Many of pandas's existing algorithms that assume Python objects would need to
  be specialized to take advantage of this new memory layout. This is both a
  pro and a con, as it will most likely yield significantly better performance.
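To make the layout above concrete, here is a minimal sketch of reconstructing
the i-th logical value without creating a Python object, assuming the
``string_array_t`` layout shown earlier (repeated here for completeness). The
``GetString`` helper is hypothetical, not a proposed API.

.. code-block:: c++

    #include <cstddef>
    #include <cstdint>

    typedef struct {
      uint32_t* indices;   /* dictionary index for each logical slot       */
      uint32_t* offsets;   /* n_categories + 1 offsets into the packed data */
      const char* data;    /* packed UTF-8 bytes                            */
      uint8_t* bitmap;     /* validity bitmap, 1 bit per slot, 1 = valid    */
    } string_array_t;

    // Returns false if slot i is null; otherwise points *out at the start of
    // the UTF-8 bytes for slot i and sets *out_length to their length.
    bool GetString(const string_array_t* arr, std::size_t i,
                   const char** out, uint32_t* out_length) {
      // Check the validity bitmap first (bit i lives in byte i / 8).
      if (((arr->bitmap[i / 8] >> (i % 8)) & 1) == 0) {
        return false;
      }
      uint32_t category = arr->indices[i];
      *out = arr->data + arr->offsets[category];
      *out_length = arr->offsets[category + 1] - arr->offsets[category];
      return true;
    }

With the example data above, ``GetString(arr, 1, ...)`` would return a pointer
to byte offset 3 with length 4 ('bars'), and ``GetString(arr, 3, ...)`` would
return false because slot 3 is null.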
Concerns / problems
===================

Preserving code that assumes PyString objects
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

Any alternate UTF-8 string in-memory representation should necessarily be able
to yield Python string objects, using ``PyUnicode_FromStringAndSize``. Thus,
code like this could continue to work:

.. ipython:: python

    s = pd.Series(["como estás?"])
    s.map(lambda x: x.upper())

One trade-off is that creating the temporary Python strings is potentially
costly. This could be mitigated for the Python ``str`` methods (by using an
optimized array-oriented code path under the hood), but for arbitrary functions
you would have to pay that cost.

Accommodating non-UTF-8 data
~~~~~~~~~~~~~~~~~~~~~~~~~~~~

Some pandas users will have code that involves various non-UTF-8 Python string
types:

* Native unicode: Py_UCS1, Py_UCS2, Py_UCS4
* Non-UTF-8 PyBytes

.. ipython:: python

    s = pd.Series(["como estás?"])
    s
    s.str.encode('latin-1')
    s.str.encode('latin-1').str.decode('latin-1')

Such data could arise, for example, from reading a CSV file that is in a
non-UTF-8 encoding without indicating the encoding to ``pandas.read_csv``.

My proposed solution to this is to provide a ``binary`` logical type having the
same physical memory layout as UTF-8 strings, with only the metadata being
different. So you would have the following semantics:

* ``latin1_s = s.encode('latin-1')``: this yields a ``binary`` view and
  allocates new memory.
* ``utf8_s = s.encode('utf-8')``: this is a no-op, but yields a ``binary``
  view.
* ``s2 = utf8_s.decode('utf-8')``: this requires running a Unicode codec to
  validate the data against the indicated codec.

Indexing and slicing
~~~~~~~~~~~~~~~~~~~~

Storing strings as UTF-8 bytes means that things like this become more
complicated:

.. ipython:: python

    s = pd.Series(["estás está estáis"])
    s.str[9]
    s.str[6:10]

Since UTF-8 is a variable-length encoding, finding the logical character at a
given position requires either going through the Python C API (expensive, since
it creates new Python objects) or a 3rd-party library. We could make use of the
ICU C++ libraries to implement this.
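To illustrate why positional indexing on UTF-8 data is more involved, here is a
minimal sketch of locating the n-th code point by scanning for non-continuation
bytes. ``FindCodePoint`` is a hypothetical helper, not a proposed API; a
production implementation would more likely rely on a library such as ICU.

.. code-block:: c++

    #include <cstddef>
    #include <cstdint>

    // Returns the byte offset of the n-th (0-based) code point, or -1 if n is
    // out of range. Because UTF-8 is variable length, this is O(size): we must
    // scan from the start rather than jump directly to a byte offset.
    int64_t FindCodePoint(const uint8_t* data, std::size_t size, std::size_t n) {
      std::size_t count = 0;
      for (std::size_t i = 0; i < size; ++i) {
        // Continuation bytes have the bit pattern 10xxxxxx; every other byte
        // starts a new code point.
        bool is_continuation = (data[i] & 0xC0) == 0x80;
        if (!is_continuation) {
          if (count == n) {
            return static_cast<int64_t>(i);
          }
          ++count;
        }
      }
      return -1;
    }

For ``"estás"``, ``FindCodePoint(data, 6, 3)`` returns byte offset 3, the start
of the two-byte sequence encoding ``'á'``.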