diff --git a/doc/pandas-2.0/.gitignore b/doc/pandas-2.0/.gitignore new file mode 100644 index 0000000000000..aeffbbed984ff --- /dev/null +++ b/doc/pandas-2.0/.gitignore @@ -0,0 +1,2 @@ +pandas2-design +_build diff --git a/doc/pandas-2.0/Makefile b/doc/pandas-2.0/Makefile new file mode 100644 index 0000000000000..654dda170fe37 --- /dev/null +++ b/doc/pandas-2.0/Makefile @@ -0,0 +1,233 @@ +# Makefile for Sphinx documentation +# + +# You can set these variables from the command line. +SPHINXOPTS = +SPHINXBUILD = sphinx-build +PAPER = +BUILDDIR = _build + +# User-friendly check for sphinx-build +ifeq ($(shell which $(SPHINXBUILD) >/dev/null 2>&1; echo $$?), 1) +$(error The '$(SPHINXBUILD)' command was not found. Make sure you have Sphinx installed, then set the SPHINXBUILD environment variable to point to the full path of the '$(SPHINXBUILD)' executable. Alternatively you can add the directory with the executable to your PATH. If you don't have Sphinx installed, grab it from http://sphinx-doc.org/) +endif + +# Internal variables. +PAPEROPT_a4 = -D latex_paper_size=a4 +PAPEROPT_letter = -D latex_paper_size=letter +ALLSPHINXOPTS = -d $(BUILDDIR)/doctrees $(PAPEROPT_$(PAPER)) $(SPHINXOPTS) source +# the i18n builder cannot share the environment and doctrees with the others +I18NSPHINXOPTS = $(PAPEROPT_$(PAPER)) $(SPHINXOPTS) source + +.PHONY: help +help: + @echo "Please use \`make ' where is one of" + @echo " html to make standalone HTML files" + @echo " dirhtml to make HTML files named index.html in directories" + @echo " singlehtml to make a single large HTML file" + @echo " pickle to make pickle files" + @echo " json to make JSON files" + @echo " htmlhelp to make HTML files and a HTML help project" + @echo " qthelp to make HTML files and a qthelp project" + @echo " applehelp to make an Apple Help Book" + @echo " devhelp to make HTML files and a Devhelp project" + @echo " epub to make an epub" + @echo " latex to make LaTeX files, you can set PAPER=a4 or PAPER=letter" + @echo " latexpdf to make LaTeX files and run them through pdflatex" + @echo " latexpdfja to make LaTeX files and run them through platex/dvipdfmx" + @echo " text to make text files" + @echo " man to make manual pages" + @echo " texinfo to make Texinfo files" + @echo " info to make Texinfo files and run them through makeinfo" + @echo " gettext to make PO message catalogs" + @echo " changes to make an overview of all changed/added/deprecated items" + @echo " xml to make Docutils-native XML files" + @echo " pseudoxml to make pseudoxml-XML files for display purposes" + @echo " linkcheck to check all external links for integrity" + @echo " doctest to run all doctests embedded in the documentation (if enabled)" + @echo " coverage to run coverage check of the documentation (if enabled)" + +.PHONY: clean +clean: + rm -rf $(BUILDDIR)/* + +.PHONY: html +html: + $(SPHINXBUILD) -b html $(ALLSPHINXOPTS) $(BUILDDIR)/html + @echo + @echo "Build finished. The HTML pages are in $(BUILDDIR)/html." + +.PHONY: dirhtml +dirhtml: + $(SPHINXBUILD) -b dirhtml $(ALLSPHINXOPTS) $(BUILDDIR)/dirhtml + @echo + @echo "Build finished. The HTML pages are in $(BUILDDIR)/dirhtml." + +.PHONY: singlehtml +singlehtml: + $(SPHINXBUILD) -b singlehtml $(ALLSPHINXOPTS) $(BUILDDIR)/singlehtml + @echo + @echo "Build finished. The HTML page is in $(BUILDDIR)/singlehtml." + +.PHONY: pickle +pickle: + $(SPHINXBUILD) -b pickle $(ALLSPHINXOPTS) $(BUILDDIR)/pickle + @echo + @echo "Build finished; now you can process the pickle files." 
+ +.PHONY: json +json: + $(SPHINXBUILD) -b json $(ALLSPHINXOPTS) $(BUILDDIR)/json + @echo + @echo "Build finished; now you can process the JSON files." + +.PHONY: htmlhelp +htmlhelp: + $(SPHINXBUILD) -b htmlhelp $(ALLSPHINXOPTS) $(BUILDDIR)/htmlhelp + @echo + @echo "Build finished; now you can run HTML Help Workshop with the" \ + ".hhp project file in $(BUILDDIR)/htmlhelp." + +.PHONY: qthelp +qthelp: + $(SPHINXBUILD) -b qthelp $(ALLSPHINXOPTS) $(BUILDDIR)/qthelp + @echo + @echo "Build finished; now you can run "qcollectiongenerator" with the" \ + ".qhcp project file in $(BUILDDIR)/qthelp, like this:" + @echo "# qcollectiongenerator $(BUILDDIR)/qthelp/pandas20DesignDocs.qhcp" + @echo "To view the help file:" + @echo "# assistant -collectionFile $(BUILDDIR)/qthelp/pandas20DesignDocs.qhc" + +.PHONY: applehelp +applehelp: + $(SPHINXBUILD) -b applehelp $(ALLSPHINXOPTS) $(BUILDDIR)/applehelp + @echo + @echo "Build finished. The help book is in $(BUILDDIR)/applehelp." + @echo "N.B. You won't be able to view it unless you put it in" \ + "~/Library/Documentation/Help or install it in your application" \ + "bundle." + +.PHONY: devhelp +devhelp: + $(SPHINXBUILD) -b devhelp $(ALLSPHINXOPTS) $(BUILDDIR)/devhelp + @echo + @echo "Build finished." + @echo "To view the help file:" + @echo "# mkdir -p $$HOME/.local/share/devhelp/pandas20DesignDocs" + @echo "# ln -s $(BUILDDIR)/devhelp $$HOME/.local/share/devhelp/pandas20DesignDocs" + @echo "# devhelp" + +.PHONY: epub +epub: + $(SPHINXBUILD) -b epub $(ALLSPHINXOPTS) $(BUILDDIR)/epub + @echo + @echo "Build finished. The epub file is in $(BUILDDIR)/epub." + +.PHONY: latex +latex: + $(SPHINXBUILD) -b latex $(ALLSPHINXOPTS) $(BUILDDIR)/latex + @echo + @echo "Build finished; the LaTeX files are in $(BUILDDIR)/latex." + @echo "Run \`make' in that directory to run these through (pdf)latex" \ + "(use \`make latexpdf' here to do that automatically)." + +.PHONY: latexpdf +latexpdf: + $(SPHINXBUILD) -b latex $(ALLSPHINXOPTS) $(BUILDDIR)/latex + @echo "Running LaTeX files through pdflatex..." + $(MAKE) -C $(BUILDDIR)/latex all-pdf + @echo "pdflatex finished; the PDF files are in $(BUILDDIR)/latex." + +.PHONY: latexpdfja +latexpdfja: + $(SPHINXBUILD) -b latex $(ALLSPHINXOPTS) $(BUILDDIR)/latex + @echo "Running LaTeX files through platex and dvipdfmx..." + $(MAKE) -C $(BUILDDIR)/latex all-pdf-ja + @echo "pdflatex finished; the PDF files are in $(BUILDDIR)/latex." + +.PHONY: text +text: + $(SPHINXBUILD) -b text $(ALLSPHINXOPTS) $(BUILDDIR)/text + @echo + @echo "Build finished. The text files are in $(BUILDDIR)/text." + +.PHONY: man +man: + $(SPHINXBUILD) -b man $(ALLSPHINXOPTS) $(BUILDDIR)/man + @echo + @echo "Build finished. The manual pages are in $(BUILDDIR)/man." + +.PHONY: texinfo +texinfo: + $(SPHINXBUILD) -b texinfo $(ALLSPHINXOPTS) $(BUILDDIR)/texinfo + @echo + @echo "Build finished. The Texinfo files are in $(BUILDDIR)/texinfo." + @echo "Run \`make' in that directory to run these through makeinfo" \ + "(use \`make info' here to do that automatically)." + +.PHONY: info +info: + $(SPHINXBUILD) -b texinfo $(ALLSPHINXOPTS) $(BUILDDIR)/texinfo + @echo "Running Texinfo files through makeinfo..." + make -C $(BUILDDIR)/texinfo info + @echo "makeinfo finished; the Info files are in $(BUILDDIR)/texinfo." + +.PHONY: gettext +gettext: + $(SPHINXBUILD) -b gettext $(I18NSPHINXOPTS) $(BUILDDIR)/locale + @echo + @echo "Build finished. The message catalogs are in $(BUILDDIR)/locale." 
+ +.PHONY: changes +changes: + $(SPHINXBUILD) -b changes $(ALLSPHINXOPTS) $(BUILDDIR)/changes + @echo + @echo "The overview file is in $(BUILDDIR)/changes." + +.PHONY: linkcheck +linkcheck: + $(SPHINXBUILD) -b linkcheck $(ALLSPHINXOPTS) $(BUILDDIR)/linkcheck + @echo + @echo "Link check complete; look for any errors in the above output " \ + "or in $(BUILDDIR)/linkcheck/output.txt." + +.PHONY: doctest +doctest: + $(SPHINXBUILD) -b doctest $(ALLSPHINXOPTS) $(BUILDDIR)/doctest + @echo "Testing of doctests in the sources finished, look at the " \ + "results in $(BUILDDIR)/doctest/output.txt." + +.PHONY: coverage +coverage: + $(SPHINXBUILD) -b coverage $(ALLSPHINXOPTS) $(BUILDDIR)/coverage + @echo "Testing of coverage in the sources finished, look at the " \ + "results in $(BUILDDIR)/coverage/python.txt." + +.PHONY: xml +xml: + $(SPHINXBUILD) -b xml $(ALLSPHINXOPTS) $(BUILDDIR)/xml + @echo + @echo "Build finished. The XML files are in $(BUILDDIR)/xml." + +.PHONY: pseudoxml +pseudoxml: + $(SPHINXBUILD) -b pseudoxml $(ALLSPHINXOPTS) $(BUILDDIR)/pseudoxml + @echo + @echo "Build finished. The pseudo-XML files are in $(BUILDDIR)/pseudoxml." + +OUTPUTDIR=_build/html +DEPLOYREPOSITORY=pandas2-design + +deploy: html + if test -d $(OUTPUTDIR); \ + then echo " (build directory exists)"; \ + else mkdir -p $(OUTPUTDIR); \ + fi + if test -d $(DEPLOYREPOSITORY); \ + then echo " (repository directory exists)"; \ + else git clone git@github.com:wesm/$(DEPLOYREPOSITORY).git; \ + fi + cd $(DEPLOYREPOSITORY) && git pull + rsync -r $(OUTPUTDIR)/* $(DEPLOYREPOSITORY)/ + cd $(DEPLOYREPOSITORY) && git add . && git commit -m "deploy" + cd $(DEPLOYREPOSITORY) && git push origin gh-pages diff --git a/doc/pandas-2.0/index.rst b/doc/pandas-2.0/index.rst new file mode 100644 index 0000000000000..6775e45c84b1f --- /dev/null +++ b/doc/pandas-2.0/index.rst @@ -0,0 +1,22 @@ +.. pandas 2.0 Design Docs documentation master file, created by + sphinx-quickstart on Mon Aug 8 11:48:39 2016. + You can adapt this file completely to your liking, but it should at least + contain the root `toctree` directive. + +Welcome to pandas 2.0 Design Docs's documentation! +================================================== + +Contents: + +.. toctree:: + :maxdepth: 2 + + + +Indices and tables +================== + +* :ref:`genindex` +* :ref:`modindex` +* :ref:`search` + diff --git a/doc/pandas-2.0/make.bat b/doc/pandas-2.0/make.bat new file mode 100644 index 0000000000000..4c22f56b72e9f --- /dev/null +++ b/doc/pandas-2.0/make.bat @@ -0,0 +1,263 @@ +@ECHO OFF + +REM Command file for Sphinx documentation + +if "%SPHINXBUILD%" == "" ( + set SPHINXBUILD=sphinx-build +) +set BUILDDIR=_build +set ALLSPHINXOPTS=-d %BUILDDIR%/doctrees %SPHINXOPTS% . +set I18NSPHINXOPTS=%SPHINXOPTS% . +if NOT "%PAPER%" == "" ( + set ALLSPHINXOPTS=-D latex_paper_size=%PAPER% %ALLSPHINXOPTS% + set I18NSPHINXOPTS=-D latex_paper_size=%PAPER% %I18NSPHINXOPTS% +) + +if "%1" == "" goto help + +if "%1" == "help" ( + :help + echo.Please use `make ^` where ^ is one of + echo. html to make standalone HTML files + echo. dirhtml to make HTML files named index.html in directories + echo. singlehtml to make a single large HTML file + echo. pickle to make pickle files + echo. json to make JSON files + echo. htmlhelp to make HTML files and a HTML help project + echo. qthelp to make HTML files and a qthelp project + echo. devhelp to make HTML files and a Devhelp project + echo. epub to make an epub + echo. latex to make LaTeX files, you can set PAPER=a4 or PAPER=letter + echo. 
text to make text files + echo. man to make manual pages + echo. texinfo to make Texinfo files + echo. gettext to make PO message catalogs + echo. changes to make an overview over all changed/added/deprecated items + echo. xml to make Docutils-native XML files + echo. pseudoxml to make pseudoxml-XML files for display purposes + echo. linkcheck to check all external links for integrity + echo. doctest to run all doctests embedded in the documentation if enabled + echo. coverage to run coverage check of the documentation if enabled + goto end +) + +if "%1" == "clean" ( + for /d %%i in (%BUILDDIR%\*) do rmdir /q /s %%i + del /q /s %BUILDDIR%\* + goto end +) + + +REM Check if sphinx-build is available and fallback to Python version if any +%SPHINXBUILD% 1>NUL 2>NUL +if errorlevel 9009 goto sphinx_python +goto sphinx_ok + +:sphinx_python + +set SPHINXBUILD=python -m sphinx.__init__ +%SPHINXBUILD% 2> nul +if errorlevel 9009 ( + echo. + echo.The 'sphinx-build' command was not found. Make sure you have Sphinx + echo.installed, then set the SPHINXBUILD environment variable to point + echo.to the full path of the 'sphinx-build' executable. Alternatively you + echo.may add the Sphinx directory to PATH. + echo. + echo.If you don't have Sphinx installed, grab it from + echo.http://sphinx-doc.org/ + exit /b 1 +) + +:sphinx_ok + + +if "%1" == "html" ( + %SPHINXBUILD% -b html %ALLSPHINXOPTS% %BUILDDIR%/html + if errorlevel 1 exit /b 1 + echo. + echo.Build finished. The HTML pages are in %BUILDDIR%/html. + goto end +) + +if "%1" == "dirhtml" ( + %SPHINXBUILD% -b dirhtml %ALLSPHINXOPTS% %BUILDDIR%/dirhtml + if errorlevel 1 exit /b 1 + echo. + echo.Build finished. The HTML pages are in %BUILDDIR%/dirhtml. + goto end +) + +if "%1" == "singlehtml" ( + %SPHINXBUILD% -b singlehtml %ALLSPHINXOPTS% %BUILDDIR%/singlehtml + if errorlevel 1 exit /b 1 + echo. + echo.Build finished. The HTML pages are in %BUILDDIR%/singlehtml. + goto end +) + +if "%1" == "pickle" ( + %SPHINXBUILD% -b pickle %ALLSPHINXOPTS% %BUILDDIR%/pickle + if errorlevel 1 exit /b 1 + echo. + echo.Build finished; now you can process the pickle files. + goto end +) + +if "%1" == "json" ( + %SPHINXBUILD% -b json %ALLSPHINXOPTS% %BUILDDIR%/json + if errorlevel 1 exit /b 1 + echo. + echo.Build finished; now you can process the JSON files. + goto end +) + +if "%1" == "htmlhelp" ( + %SPHINXBUILD% -b htmlhelp %ALLSPHINXOPTS% %BUILDDIR%/htmlhelp + if errorlevel 1 exit /b 1 + echo. + echo.Build finished; now you can run HTML Help Workshop with the ^ +.hhp project file in %BUILDDIR%/htmlhelp. + goto end +) + +if "%1" == "qthelp" ( + %SPHINXBUILD% -b qthelp %ALLSPHINXOPTS% %BUILDDIR%/qthelp + if errorlevel 1 exit /b 1 + echo. + echo.Build finished; now you can run "qcollectiongenerator" with the ^ +.qhcp project file in %BUILDDIR%/qthelp, like this: + echo.^> qcollectiongenerator %BUILDDIR%\qthelp\pandas20DesignDocs.qhcp + echo.To view the help file: + echo.^> assistant -collectionFile %BUILDDIR%\qthelp\pandas20DesignDocs.ghc + goto end +) + +if "%1" == "devhelp" ( + %SPHINXBUILD% -b devhelp %ALLSPHINXOPTS% %BUILDDIR%/devhelp + if errorlevel 1 exit /b 1 + echo. + echo.Build finished. + goto end +) + +if "%1" == "epub" ( + %SPHINXBUILD% -b epub %ALLSPHINXOPTS% %BUILDDIR%/epub + if errorlevel 1 exit /b 1 + echo. + echo.Build finished. The epub file is in %BUILDDIR%/epub. + goto end +) + +if "%1" == "latex" ( + %SPHINXBUILD% -b latex %ALLSPHINXOPTS% %BUILDDIR%/latex + if errorlevel 1 exit /b 1 + echo. + echo.Build finished; the LaTeX files are in %BUILDDIR%/latex. 
+ goto end +) + +if "%1" == "latexpdf" ( + %SPHINXBUILD% -b latex %ALLSPHINXOPTS% %BUILDDIR%/latex + cd %BUILDDIR%/latex + make all-pdf + cd %~dp0 + echo. + echo.Build finished; the PDF files are in %BUILDDIR%/latex. + goto end +) + +if "%1" == "latexpdfja" ( + %SPHINXBUILD% -b latex %ALLSPHINXOPTS% %BUILDDIR%/latex + cd %BUILDDIR%/latex + make all-pdf-ja + cd %~dp0 + echo. + echo.Build finished; the PDF files are in %BUILDDIR%/latex. + goto end +) + +if "%1" == "text" ( + %SPHINXBUILD% -b text %ALLSPHINXOPTS% %BUILDDIR%/text + if errorlevel 1 exit /b 1 + echo. + echo.Build finished. The text files are in %BUILDDIR%/text. + goto end +) + +if "%1" == "man" ( + %SPHINXBUILD% -b man %ALLSPHINXOPTS% %BUILDDIR%/man + if errorlevel 1 exit /b 1 + echo. + echo.Build finished. The manual pages are in %BUILDDIR%/man. + goto end +) + +if "%1" == "texinfo" ( + %SPHINXBUILD% -b texinfo %ALLSPHINXOPTS% %BUILDDIR%/texinfo + if errorlevel 1 exit /b 1 + echo. + echo.Build finished. The Texinfo files are in %BUILDDIR%/texinfo. + goto end +) + +if "%1" == "gettext" ( + %SPHINXBUILD% -b gettext %I18NSPHINXOPTS% %BUILDDIR%/locale + if errorlevel 1 exit /b 1 + echo. + echo.Build finished. The message catalogs are in %BUILDDIR%/locale. + goto end +) + +if "%1" == "changes" ( + %SPHINXBUILD% -b changes %ALLSPHINXOPTS% %BUILDDIR%/changes + if errorlevel 1 exit /b 1 + echo. + echo.The overview file is in %BUILDDIR%/changes. + goto end +) + +if "%1" == "linkcheck" ( + %SPHINXBUILD% -b linkcheck %ALLSPHINXOPTS% %BUILDDIR%/linkcheck + if errorlevel 1 exit /b 1 + echo. + echo.Link check complete; look for any errors in the above output ^ +or in %BUILDDIR%/linkcheck/output.txt. + goto end +) + +if "%1" == "doctest" ( + %SPHINXBUILD% -b doctest %ALLSPHINXOPTS% %BUILDDIR%/doctest + if errorlevel 1 exit /b 1 + echo. + echo.Testing of doctests in the sources finished, look at the ^ +results in %BUILDDIR%/doctest/output.txt. + goto end +) + +if "%1" == "coverage" ( + %SPHINXBUILD% -b coverage %ALLSPHINXOPTS% %BUILDDIR%/coverage + if errorlevel 1 exit /b 1 + echo. + echo.Testing of coverage in the sources finished, look at the ^ +results in %BUILDDIR%/coverage/python.txt. + goto end +) + +if "%1" == "xml" ( + %SPHINXBUILD% -b xml %ALLSPHINXOPTS% %BUILDDIR%/xml + if errorlevel 1 exit /b 1 + echo. + echo.Build finished. The XML files are in %BUILDDIR%/xml. + goto end +) + +if "%1" == "pseudoxml" ( + %SPHINXBUILD% -b pseudoxml %ALLSPHINXOPTS% %BUILDDIR%/pseudoxml + if errorlevel 1 exit /b 1 + echo. + echo.Build finished. The pseudo-XML files are in %BUILDDIR%/pseudoxml. + goto end +) + +:end diff --git a/doc/pandas-2.0/source/conf.py b/doc/pandas-2.0/source/conf.py new file mode 100644 index 0000000000000..071c8bd31ea6c --- /dev/null +++ b/doc/pandas-2.0/source/conf.py @@ -0,0 +1,290 @@ +#!/usr/bin/env python3 +# -*- coding: utf-8 -*- +# +# pandas 2.0 Design Docs documentation build configuration file, created by +# sphinx-quickstart on Mon Aug 8 11:48:39 2016. +# +# This file is execfile()d with the current directory set to its +# containing dir. +# +# Note that not all possible configuration values are present in this +# autogenerated file. +# +# All configuration values have a default; values that are commented out +# serve to show the default. + +import sys +import os + +# If extensions (or modules to document with autodoc) are in another directory, +# add these directories to sys.path here. If the directory is relative to the +# documentation root, use os.path.abspath to make it absolute, like shown here. 
+#sys.path.insert(0, os.path.abspath('.')) + +# -- General configuration ------------------------------------------------ + +# If your documentation needs a minimal Sphinx version, state it here. +# needs_sphinx = '1.0' + +# Add any Sphinx extension module names here, as strings. They can be +# extensions coming with Sphinx (named 'sphinx.ext.*') or your custom +# ones. + +extensions = ['IPython.sphinxext.ipython_directive', + 'IPython.sphinxext.ipython_console_highlighting'] + +# Add any paths that contain templates here, relative to this directory. +templates_path = ['_templates'] + +# The suffix(es) of source filenames. +# You can specify multiple suffix as a list of string: +# source_suffix = ['.rst', '.md'] +source_suffix = '.rst' + +# The encoding of source files. +#source_encoding = 'utf-8-sig' + +# The master toctree document. +master_doc = 'index' + +# General information about the project. +project = "Wes's pandas 2.0 Design Docs" +copyright = '2016, Wes McKinney' +author = 'Wes McKinney' + +# The version info for the project you're documenting, acts as replacement for +# |version| and |release|, also used in various other places throughout the +# built documents. +# +# The short X.Y version. +version = '0.1' +# The full version, including alpha/beta/rc tags. +release = '0.1' + +# The language for content autogenerated by Sphinx. Refer to documentation +# for a list of supported languages. +# +# This is also used if you do content translation via gettext catalogs. +# Usually you set "language" from the command line for these cases. +language = None + +# There are two options for replacing |today|: either, you set today to some +# non-false value, then it is used: +#today = '' +# Else, today_fmt is used as the format for a strftime call. +#today_fmt = '%B %d, %Y' + +# List of patterns, relative to source directory, that match files and +# directories to ignore when looking for source files. +exclude_patterns = ['_build'] + +# The reST default role (used for this markup: `text`) to use for all +# documents. +#default_role = None + +# If true, '()' will be appended to :func: etc. cross-reference text. +#add_function_parentheses = True + +# If true, the current module name will be prepended to all description +# unit titles (such as .. function::). +#add_module_names = True + +# If true, sectionauthor and moduleauthor directives will be shown in the +# output. They are ignored by default. +#show_authors = False + +# The name of the Pygments (syntax highlighting) style to use. +pygments_style = 'sphinx' + +# A list of ignored prefixes for module index sorting. +#modindex_common_prefix = [] + +# If true, keep warnings as "system message" paragraphs in the built documents. +#keep_warnings = False + +# If true, `todo` and `todoList` produce output, else they produce nothing. +todo_include_todos = False + + +# -- Options for HTML output ---------------------------------------------- + +# The theme to use for HTML and HTML Help pages. See the documentation for +# a list of builtin themes. +import sphinx_rtd_theme + +html_theme = "sphinx_rtd_theme" + +html_theme_path = [sphinx_rtd_theme.get_html_theme_path()] + +# Theme options are theme-specific and customize the look and feel of a theme +# further. For a list of options available for each theme, see the +# documentation. +#html_theme_options = {} + +# Add any paths that contain custom themes here, relative to this directory. +#html_theme_path = [] + +# The name for this set of Sphinx documents. 
If None, it defaults to +# " v documentation". +#html_title = None + +# A shorter title for the navigation bar. Default is the same as html_title. +#html_short_title = None + +# The name of an image file (relative to this directory) to place at the top +# of the sidebar. +#html_logo = None + +# The name of an image file (within the static path) to use as favicon of the +# docs. This file should be a Windows icon file (.ico) being 16x16 or 32x32 +# pixels large. +#html_favicon = None + +# Add any paths that contain custom static files (such as style sheets) here, +# relative to this directory. They are copied after the builtin static files, +# so a file named "default.css" will overwrite the builtin "default.css". +html_static_path = ['_static'] + +# Add any extra paths that contain custom files (such as robots.txt or +# .htaccess) here, relative to this directory. These files are copied +# directly to the root of the documentation. +#html_extra_path = [] + +# If not '', a 'Last updated on:' timestamp is inserted at every page bottom, +# using the given strftime format. +#html_last_updated_fmt = '%b %d, %Y' + +# If true, SmartyPants will be used to convert quotes and dashes to +# typographically correct entities. +#html_use_smartypants = True + +# Custom sidebar templates, maps document names to template names. +#html_sidebars = {} + +# Additional templates that should be rendered to pages, maps page names to +# template names. +#html_additional_pages = {} + +# If false, no module index is generated. +#html_domain_indices = True + +# If false, no index is generated. +#html_use_index = True + +# If true, the index is split into individual pages for each letter. +#html_split_index = False + +# If true, links to the reST sources are added to the pages. +#html_show_sourcelink = True + +# If true, "Created using Sphinx" is shown in the HTML footer. Default is True. +#html_show_sphinx = True + +# If true, "(C) Copyright ..." is shown in the HTML footer. Default is True. +#html_show_copyright = True + +# If true, an OpenSearch description file will be output, and all pages will +# contain a tag referring to it. The value of this option must be the +# base URL from which the finished HTML is served. +#html_use_opensearch = '' + +# This is the file name suffix for HTML files (e.g. ".xhtml"). +#html_file_suffix = None + +# Language to be used for generating the HTML full-text search index. +# Sphinx supports the following languages: +# 'da', 'de', 'en', 'es', 'fi', 'fr', 'h', 'it', 'ja' +# 'nl', 'no', 'pt', 'ro', 'r', 'sv', 'tr' +#html_search_language = 'en' + +# A dictionary with options for the search language support, empty by default. +# Now only 'ja' uses this config value +#html_search_options = {'type': 'default'} + +# The name of a javascript file (relative to the configuration directory) that +# implements a search results scorer. If empty, the default will be used. +#html_search_scorer = 'scorer.js' + +# Output file base name for HTML help builder. +htmlhelp_basename = 'pandas20DesignDocsdoc' + +# -- Options for LaTeX output --------------------------------------------- + +latex_elements = { +# The paper size ('letterpaper' or 'a4paper'). +#'papersize': 'letterpaper', + +# The font size ('10pt', '11pt' or '12pt'). +#'pointsize': '10pt', + +# Additional stuff for the LaTeX preamble. +#'preamble': '', + +# Latex figure (float) alignment +#'figure_align': 'htbp', +} + +# Grouping the document tree into LaTeX files. 
List of tuples +# (source start file, target name, title, +# author, documentclass [howto, manual, or own class]). +latex_documents = [ + (master_doc, 'pandas20DesignDocs.tex', 'pandas 2.0 Design Docs Documentation', + 'Wes McKinney', 'manual'), +] + +# The name of an image file (relative to this directory) to place at the top of +# the title page. +#latex_logo = None + +# For "manual" documents, if this is true, then toplevel headings are parts, +# not chapters. +#latex_use_parts = False + +# If true, show page references after internal links. +#latex_show_pagerefs = False + +# If true, show URL addresses after external links. +#latex_show_urls = False + +# Documents to append as an appendix to all manuals. +#latex_appendices = [] + +# If false, no module index is generated. +#latex_domain_indices = True + + +# -- Options for manual page output --------------------------------------- + +# One entry per manual page. List of tuples +# (source start file, name, description, authors, manual section). +man_pages = [ + (master_doc, 'pandas20designdocs', 'pandas 2.0 Design Docs Documentation', + [author], 1) +] + +# If true, show URL addresses after external links. +#man_show_urls = False + + +# -- Options for Texinfo output ------------------------------------------- + +# Grouping the document tree into Texinfo files. List of tuples +# (source start file, target name, title, author, +# dir menu entry, description, category) +texinfo_documents = [ + (master_doc, 'pandas20DesignDocs', 'pandas 2.0 Design Docs Documentation', + author, 'pandas20DesignDocs', 'One line description of project.', + 'Miscellaneous'), +] + +# Documents to append as an appendix to all manuals. +#texinfo_appendices = [] + +# If false, no module index is generated. +#texinfo_domain_indices = True + +# How to display URL addresses: 'footnote', 'no', or 'inline'. +#texinfo_show_urls = 'footnote' + +# If true, do not generate a @detailmenu in the "Top" node's menu. +#texinfo_no_detailmenu = False diff --git a/doc/pandas-2.0/source/copyonwrite.rst b/doc/pandas-2.0/source/copyonwrite.rst new file mode 100644 index 0000000000000..321614171c873 --- /dev/null +++ b/doc/pandas-2.0/source/copyonwrite.rst @@ -0,0 +1,5 @@ +.. _copyonwrite: + +================================== + View semantics and Copy-On-Write +================================== diff --git a/doc/pandas-2.0/source/goals.rst b/doc/pandas-2.0/source/goals.rst new file mode 100644 index 0000000000000..cc2ff6467e271 --- /dev/null +++ b/doc/pandas-2.0/source/goals.rst @@ -0,0 +1,194 @@ +.. _goals: + +======================= + Goals and Motivations +======================= + +.. note:: + + These documents are largely written by Wes McKinney, and at this point + reflect his opinions for the time being. Many things may change as we discuss + and work to reach a consensus about the path forward. + +The pandas codebase is now over 8 years old, having grown to over 200,000 lines +of code from its original ~10,000 LOC in the original 0.1 open source release +in January 2010. + +At a high level, the "pandas 2.0" effort is based on a number of observations: + +* The pandas 0.x series of releases have consisted with huge amounts of + iterative improvements to the library along with some major new features, bug + fixes, and improved documentation. There have also been a series of + deprecations, API changes, and other evolutions of pandas's API to account + for suboptimal design choices (for example: the ``.ix`` operator) made in the + early days of the project (2010 to 2012). 
+* The unification of Series and DataFrame internals to be based on a common + ``NDFrame`` base class and "block manager" data structure (originally created + by me in 2011, and heroically driven forward to its modern form by Jeff + Reback), while introducing many benefits to pandas, has come to be viewed as + a long-term source of technical debt and code complexity. +* pandas's ability to support an increasingly broad set of use cases has been + significantly constrained (as will be examined in detail in these documents) + by its tight coupling to NumPy and therefore subject to various limitations + in NumPy. +* Making significant functional additions (particularly filling gaps in NumPy) + to pandas, particularly new data types, has grown increasingly complex with + very obvious accumulations of technical debt. +* pandas is being used increasingly for very large datasets on machines with + many cores and large amounts of RAM (100s of gigabytes to terabytes). It + would be nice to be able to better utilize these larger, beefier systems + within a single Python process. +* pandas is being used increasingly as a computational building block of some + larger system, such as Dask or Apache Spark. We should consider reducing the + overhead for making data accessible to pandas (i.e. via memory-mapping or + other low-overhead memory sharing). +* Rough edges in pandas's implementation (e.g. its handling of missing data + across data types) are being exposed to users. + +These documents are largely concerned with pandas's internal design, which is +mostly invisible to average users. Advanced users of pandas are generally +familiar with some of these internal details, particular around performance and +memory use, and so the degree to which users are impacted will vary quite a +lot. + +Key areas of work +================= + +Possible changes or improvements to pandas's internals fall into a number of +different buckets to be explored in great detail: + +* **Decoupling from NumPy while preserving interoperability**: by eliminating + the presumption that pandas objects internally must contain data stored in + NumPy ``ndarray`` objects, we will be able to bring more consistency to + pandas's semantics and enable the core developers to extend pandas more + cleanly with new data types, data structures, and computational semantics. +* **Exposing a pandas Cython and/or C/C++ API to other Python library + developers**: the internals of Series and DataFrame are only weakly + accessible in other developers' native code. At minimum, we wish to better + enable developers to construct the precise data structures / memory + representation that fill the insides of Series and DataFrame. +* **Improving user control and visibility of memory use**: pandas's memory use, + as a result of its internal implementation, can frequently be opaque to the + user or outright unpredictable. +* **Improving performance and system utilization**: We aim to improve both the + micro (operations that take < 1 ms) and macro (all other operations) + performance of pandas across the board. As part of this, we aim to make it + easier for pandas's core developers to leverage multicore systems to + accelerate computations (without running into any of Python's well-known + concurrency limitations) +* **Removal of deprecated / underutilized functionality**: As the Python data + ecosystem has grown, a number of areas of pandas (e.g. plotting and datasets + with more than 2 dimensions) may be better served by other open source + projects. 
Also, functionality that has been explicitly deprecated or + discouraged from use (like the ``.ix`` indexing operator) would ideally be + removed. + +Non-goals / FAQ +=============== + +As this will be a quite nuanced discussion, especially for those not intimately +familiar with pandas's implementation details, I wanted to speak to a couple of +commonly-asked questions in brief: + +```` + +1. **Will this work make it harder to use pandas with NumPy, scikit-learn, + statsmodels, SciPy, or other libraries that depend on NumPy + interoperability?** + * We are not planning on it. Data that is representable without memory + copying or conversion in NumPy arrays will continue to be 100% + interoperable. + * Data containing missing (NA) values may require explicit conversion where + it is not currently required. For example: integer or boolean type arrays + with missing data. I trust this will be seen as a positive development. + * If anything, more performant and more precise data semantics in pandas will + generally make production code using a downstream library like scikit-learn + more dependable and future-proof. + +```` + +2. **By decoupling from NumPy, it sounds like you are reimplementing NumPy or + adding a new data type system** + + * Simply put: no. But it's more complicated than that because of the + numerous interpretations of "type system". + + * pandas already contains a large amount (10s of KLOCs) of custom + computational code (see, for example, + ``_) that implements + functionality not present in NumPy. + + * pandas already features its own (what I will describe as a) "logical type + system", including things like custom data types (such as that of + ``pandas.Categorical``), pandas-specific missing data representation, and + implicit type casting (e.g. integer to float on introduction of missing + data). Unfortunately, these logical data types are somewhat weakly + expressed, and the mix of NumPy dtype objects and custom pandas types is + problematic for many internal (implementation) and external (user API) + reasons. I will examine in detail the difference between **physical + types** (i.e. NumPy's dtypes) and **logical types** (i.e. what pandas + currently has, implicitly). + +```` + +3. **Shouldn't you try to accomplish your goals by contributing work to NumPy + instead of investing major work in pandas's internals?** + + * In my opinion, this is a "false dichotomy"; i.e. these things are not + mutually exclusive. + + * Yes, we should define, scope, and if possible help implement improvements + to NumPy that make sense. As NumPy serves a significantly larger and more + diverse set of users, major changes to the NumPy C codebase must be + approached more conservatively. + + * It is unclear that pandas's body of domain-specific data handling and + computational code is entirely "in scope" for NumPy. Some technical + details, such as our categorical or datetime data semantics, "group by" + functionality, relational algebra (joins), etc., may be ideal for pandas + but not necessarily ideal for a general user of NumPy. My opinion is that + functionality from NumPy we wish to use in pandas should "pass through" to + the user unmodified, but we must retain the flexibility to work "outside + the box" (implement things not found in NumPy) without adding technical + debt or user API complexity. + +```` + +4. 
**API changes / breaks are thought to be bad; don't you have a + responsibility to maintain backwards compatibility for users that heavily + depend on pandas?** + + * It's true that APIs should not be broken or changed, and as such should be + approached with extreme caution. + + * The goal of the pandas 2.0 initiative is to only make "good" API breaks + that yield a net benefit that can be easily demonstrated. As an example: + adding native missing data support to integer and boolean data (without + casting to another physical storage type) may break user code that has + knowledge of the "rough edge" (the behavior that we are fixing). As these + changes will mostly affect advanced pandas users, I expect they will be + welcomed. + + * Any major API change or break will be documented and justified to assist + with code migration. + + * As soon as we are able, we will post binary development artifacts for the + pandas 2.0 development branch to get early feedback from heavy pandas + users to understand the impact of changes and how we can better help the + existing user base. + + * Some users will find that a certain piece of code has been working "by + accident" (i.e. relying upon undocumented behavior). This kind of breakage + is already a routine occurrence unfortunately. + +Summary +======= + +Overall, the goal of the pandas 2.0 project is to yield a faster, more cleanly +architected, and more future-proof library that is a drop-in replacement for +90-95% of pandas user code. There will be API / code breakages, but the intent +of any code breakage will almost always be to fix something that has been +"wrong" or inconsistent. Many advanced users will have worked around some of +these rough edges, and so their workarounds may either need to be removed or +changed to accommodate the new (and hopefully it can be agreed in each case: +better) semantics. diff --git a/doc/pandas-2.0/source/index.rst b/doc/pandas-2.0/source/index.rst new file mode 100644 index 0000000000000..70a2c25bbf2b5 --- /dev/null +++ b/doc/pandas-2.0/source/index.rst @@ -0,0 +1,24 @@ +Wes's pandas 2.0 Design Documents +================================= + +These are a set of documents, based on discussions started in December 2015, to +assist with discussions around changes to Python pandas's internal design +intended to better accommodate the evolving needs of the growing Python data +userbase and to help ensure that pandas remains a relevant and important +project in the future. + +.. toctree:: + :maxdepth: 3 + + goals + internal-architecture + strings + copyonwrite + removals + +.. Indices and tables +.. ================== + +.. * :ref:`genindex` +.. * :ref:`modindex` +.. * :ref:`search` diff --git a/doc/pandas-2.0/source/internal-architecture.rst b/doc/pandas-2.0/source/internal-architecture.rst new file mode 100644 index 0000000000000..c3d38e2957aa3 --- /dev/null +++ b/doc/pandas-2.0/source/internal-architecture.rst @@ -0,0 +1,714 @@ +.. _internal-architecture: + +.. ipython:: python + :suppress: + + import numpy as np + import pandas as pd + np.set_printoptions(precision=4, suppress=True) + pd.options.display.max_rows = 100 + +=============================== + Internal Architecture Changes +=============================== + +Logical types and Physical Storage Decoupling +============================================= + +Since this is the most important, but perhaps also most controversial, change +(in my opinion) to pandas, I'm going to go over it in great detail. 
I think the +hardest part is coming up with clear language and definitions for concepts so +that we can communicate effectively. For example the term "data type" is vague +and may mean different things to different people. + +A motivating example +~~~~~~~~~~~~~~~~~~~~ + +Before digging too much into the technical details and problems/solutions, +let's look at some code examples. It is not unusual to find code like this in +pandas's internals: + +.. code-block:: python + + def create_from_value(value, index, dtype): + # return a new empty value suitable for the dtype + + if is_datetimetz(dtype): + subarr = DatetimeIndex([value] * len(index), dtype=dtype) + elif is_categorical_dtype(dtype): + subarr = Categorical([value] * len(index)) + else: + if not isinstance(dtype, (np.dtype, type(np.dtype))): + dtype = dtype.dtype + subarr = np.empty(len(index), dtype=dtype) + subarr.fill(value) + +or + +.. code-block:: python + + if is_categorical_dtype(dtype): + upcast_cls = 'category' + elif is_datetimetz(dtype): + upcast_cls = 'datetimetz' + elif issubclass(dtype.type, np.bool_): + upcast_cls = 'bool' + elif issubclass(dtype.type, np.object_): + upcast_cls = 'object' + elif is_datetime64_dtype(dtype): + upcast_cls = 'datetime' + elif is_timedelta64_dtype(dtype): + upcast_cls = 'timedelta' + else: + upcast_cls = 'float' + +I've cherry-picked one of a number of places where this type of datatype-based +branching happens. + +The primary reason for this complexity is that pandas is using both NumPy's +dtype objects (which describe *physical storage*) as well as its own custom +data type objects as a proxy for pandas's *semantic logical types*. + +Let's step back for a second and come up with clear language to steer the +discussion. + +Some definitions +~~~~~~~~~~~~~~~~ + +Here is my attempt at definitions of some of the key terms: + +* **Metadata**: data that describes other data (such as its in-memory layout) + +* **Semantics**: The meaning / abstract interpretation of something. We often + discuss the semantics (meaning) of computer programs (i.e. what they do, + fundamentally) without touching upon low level details like machine + representation, programming languages, compilers, operating systems, etc. + +* **Physical data (or storage) types**: these are metadata objects which + provide a description of the precise structure of a piece of data in memory. + + * In NumPy, the ``numpy.dtype`` object (aka ``PyArray_Descr`` in the C API) + is metadata describing a single cell / value in an array. Combined with the + ``shape`` and ``strides`` attributes of the ``ndarray`` object, you have + enough information to perform O(1) random access on any cell in an + ``ndarray`` and to assign these values to a C type (or, in the case, of + structured dtypes, assign to a packed C struct). + + * This may or may not include a physical representation of NULL or missing + data (for example: nullable float64 might be a physical type indicating a + normal float64 array along with a bitmap of null/not-null indicators). + +* **Logical data type**: metadata which describes the semantic content of a + single value in an array or other collection of values. Depending on the + logical type, it may map 1-to-1 to a physical type or not at all. Here are + some examples: + + * The ``double`` or ``float64`` type may be viewed both as a logical type as + well as a physical type (a 1-to-1 correspondence). 
+ + * pandas's ``category`` dtype contains its own auxiliary array of category + values (for example, the distinct strings collected from a string + array). Based on the number of categories, the category ``codes`` (which + reference the categories array) are stored in the smallest possible integer + physical type (from ``int8`` to ``int64``, depending whether the data type + can accommodate the codes). For example, if there are 50 codes, the data is + represented in ``int8`` storage. For 1000 codes, it would be ``int16``. + + * Another example: timestamps may be physically stored in ``int64`` + storage, and these values are interpreted in the context of a particular + time unit or resolution (e.g. nanoseconds, milliseconds, seconds). + +In general, new logical types may be formed either by placing new semantics on +top of a single physical data type or some composition of physical or logical +types. For example: you could have a categorical type (a logical construct +consisting of multiple arrays of data) whose categories are some other logical +type. + +For historical reasons, **pandas never developed a clear or clean semantic +separation in its user API between logical and physical data types**. Also, the +addition of new, pandas-only "synthetic" dtypes that are unknown to NumPy (like +categorical, datetimetz, etc.) has expanded this conflation considerably. If +you also consider pandas's custom missing / NULL data behavior, the addition of +ad hoc missing data semantics to a physical NumPy data type created, by the +definitions above, a logical data type (call it ``object[nullable]`` for an +object array) without ever explicitly saying so. + +You might be thinking, "Good job, Wes. You really messed that up!" I'd be +inclined to agree with you now in retrospect, but back in 2011 pandas was not +the super popular project that it is today, and we were truly riding on NumPy's +coat tails. So the extent to which NumPy concepts and APIs were used explicitly +in pandas made the library easier to adopt. Now in 2016, this feels +anachronistic / outdated. + +High-level logical type proposal +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +As we have been discussing periodically on the pandas-dev mailing list and +GitHub, I am proposing that we start to unravel our current mess by defining +pandas-specific metadata objects that model the current semantics / behavior of +the project. What does this mean, exactly? + +* Each NumPy dtype object will map 1-to-1 to an equivalent ``pandas.DataType`` + object. +* Existing pandas "extension dtypes" (like ``CategoricalDtype`` and + ``DatetimeTZDtype``), which have been designed to mimic ``numpy.dtype``, will + become logical type subclasses of ``pandas.DataType`` like every other type + in pandas. + +Since pandas is about assisting with data manipulation and analysis, at some +point you must invoke functions that are specialized to the specific physical +memory representation of your data. For example, pandas has its own +implementations of ``ndarray.take`` that are used internally for arrays of +positive integers that may contain NULL / NA values (which are represented as +-1 -- search the codebase for implementations of ``take_1d``). + +The major goals of introducing a logical type abstraction are the follows: + +* Simplifying "dynamic dispatch": invoking the right functions or choosing the + right code branches based on the data type. +* Enabling pandas to decouple both its internal semantics and physical storage + from NumPy's metadata and APIs. 
Note that this is already happening with
+  categorical types, since a particular instance of ``CategoricalDtype`` may
+  physically be stored in one of 4 NumPy data types.
+
+Physical storage decoupling
+~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+By separating pandas data from the presumption of using a particular physical
+``numpy.dtype`` internally, we can:
+
+* Begin to better protect users from NumPy data semantics (which are frequently
+  different from pandas's!) leaking through to the pandas user API. This can
+  enable us to address long-standing inconsistencies or "rough edges" in pandas
+  that have persisted due to our tight semantic coupling to NumPy.
+
+* Consider adding new data structures to pandas, either custom to pandas or
+  provided by 3rd-party libraries, that add new functionality alongside the
+  existing code (presuming NumPy physical storage). As one concrete example,
+  discussed in more detail below, we can enable missing data in integer pandas
+  data by forming a composite data structure consisting of a NumPy array plus a
+  bitmap marking the null / not-null values.
+
+* Start to think about improved behavior around data ownership (like
+  copy-on-write), which may yield many benefits. I will write a dedicated
+  section about this.
+
+Note that none of these points implies that we are trying to use NumPy
+less. We already have large amounts of code that implement algorithms similar
+to those found in NumPy (e.g. ``pandas.unique`` or the implementation of
+``Series.sum``), but taking into account pandas's missing data representation,
+etc. Internally, we can use NumPy when its computational semantics match those
+we've chosen for pandas, and elsewhere we can invoke pandas-specific code.
+
+A major concern here based on these ideas is **preserving NumPy
+interoperability**, so I'll examine this topic in some detail next.
+
+Preserving NumPy interoperability
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+Some of the types of intended interoperability between NumPy and pandas are as
+follows:
+
+* Users can obtain a ``numpy.ndarray`` (possibly a view, depending on the
+  internal block structure, more on this soon) in constant time and without
+  copying the actual data. This has a couple of other implications:
+
+  * Changes made to this array will be reflected in the source pandas object.
+  * If you write C extension code (possibly in Cython) and respect pandas's
+    missing data details, you can invoke certain kinds of fast custom code on
+    pandas data (but it's somewhat inflexible -- see the latest discussion on
+    adding a native code API to pandas).
+
+* NumPy ufuncs (like ``np.sqrt`` or ``np.log``) can be invoked on
+  pandas objects like Series and DataFrame.
+
+* ``numpy.asarray`` will always yield some array, even if it discards metadata
+  or has to create a new array. For example, ``asarray`` invoked on
+  ``pandas.Categorical`` yields a reconstructed array (rather than either the
+  categories or codes internal arrays).
+
+* Many NumPy methods designed to work on subclasses (or duck-typed classes) of
+  ``ndarray`` may be used. For example, ``numpy.sum`` may be used on a Series
+  even though it does not invoke NumPy's internal C sum algorithm. This means
+  that a Series may be used as an interchangeable argument in a large set of
+  functions that only know about NumPy arrays.
+
+By and large, I think much of this can be preserved, but there will be some API
+breakage.
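+
+As a rough illustration of the kinds of interoperability listed above, here is
+a short sketch using current pandas 0.x behavior (exact outputs may differ
+slightly between versions):
+
+.. code-block:: python
+
+   import numpy as np
+   import pandas as pd
+
+   s = pd.Series([1., 2., 3.])
+
+   # Constant-time access to the underlying NumPy data (a view for
+   # NumPy-native dtypes); mutating the view is reflected in the Series
+   arr = s.values
+   arr[0] = 100.
+
+   # NumPy ufuncs can be invoked directly on pandas objects
+   np.sqrt(s)
+
+   # numpy.asarray always yields *some* array; for Categorical it
+   # reconstructs the values rather than exposing codes / categories
+   cat = pd.Categorical(['a', 'b', 'a'])
+   np.asarray(cat)
+
+   # Duck-typed NumPy reductions also work on Series
+   np.sum(s)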
+ +If we add more composite data structures (Categorical can be thought of as +one existing composite data structure) to pandas or alternate non-NumPy data +structures, there will be cases where the semantic information in a Series +cannot be adequately represented in a NumPy array. + +As one example, if we add pandas-only missing data support to integer and +boolean data (a long requested feature), calling ``np.asarray`` on such data +may not have well-defined behavior. As present, pandas is implicitly converting +these types to ``float64`` (see more below), which isn't too great. A decision +does not need to be made now, but the benefits of solving this long-standing +issue may merit breaking ``asarray`` as long as we provide an explicit way to +obtain the original casted ``float64`` NumPy array (with ``NaN`` for NULL/NA +values) + +For pandas data that does not step outside NumPy's semantic realm, we can +continue to provide zero-copy views in many cases. + +Missing data consistency +======================== + +Once the physical memory representation has been effectively decoupled from the +user API, we can consider various approaches to implementing missing data in a +consistent way for every logical pandas data type. + +To motivate this, let's look at some integer data: + +.. ipython:: python + + s = pd.Series([1, 2, 3, 4, 5]) + s + s.dtype + s.values + +If we assign a ``numpy.NaN``, see what happens: + +.. ipython:: python + + s[2] = np.NaN + s + s.dtype + s.values + +The story for boolean data is similar: + +.. ipython:: python + + s = pd.Series([True, False, True]) + s.dtype + s[2] = np.NaN + s.dtype + s.values + +This implicit behavior appears in many scenarios, such as: + +* Loading data from any source: databases, CSV files, R data files, etc. +* Joins or reindexing operations introducing missing data +* Pivot / reshape operations +* Time series resampling +* Certain types of GroupBy operations + +A proposed solution +~~~~~~~~~~~~~~~~~~~ + +My proposal for introducing missing data into any NumPy type outside of +floating point (which uses ``NaN`` for now) and Python object (which uses +``None`` or ``NaN`` interchangeably) is to **allocate and manage an internal +bitmap** (which the user never sees). This has numerous benefits: + +* 1 byte of memory overhead for each 8 values +* Bitmaps can propagate their nulls in C through bitwise ``&`` or ``|`` + operations, which are inexpensive. +* Getting and setting bits on modern hardware is CPU-inexpensive. For + single-pass array operations (like groupbys) on large arrays this may also + result in better CPU cache utilization (fewer main-memory reads of the + bitmap). +* Hardware and SIMD "popcount" intrinsics (which can operate on 64-128 bits at + a time) can be used to count bits and skip null-handling on segments of data + containing no nulls. + +Notably, this is the way that PostgreSQL handles null values. For example, we +might have: + +.. code-block:: + + [0, 1, 2, NA, NA, 5, 6, NA] + + i: 7 6 5 4 3 2 1 0 + bitmap: 0 1 1 0 0 1 1 1 + +Here, the convention of 1 for "not null" (a la PostgreSQL) and +least-significant bit ordering (LSB "bit endianness") is being used. + +Under the new regime, users could simply write: + +.. code-block:: python + + s[2] = pandas.NA + +and the data type would be unmodified. It may be necessary to write something +akin to: + +.. code-block:: python + + s.to_numpy(dtype=np.float64, na_rep=np.nan) + +and that would emulate the current behavior. 
Attempts to use ``__array__` (for +example: calling ``np.sqrt`` on the data) would result in an error since we +will likely want to refuse to make a guess as for what casting behavior the +user desires. + +Tradeoffs +~~~~~~~~~ + +One potential downside of the bitmap approach is that missing data implemented +outside of NumPy's domain will need to be explicitly converted if it is needed +in another library that only knows about NumPy. I argue that this is better +than the current implicit conversion which could yield data loss (for integers +falling outside the exact representable range for ``float64``). + +Removal of BlockManager / new DataFrame internals +================================================= + +Deep inside the belly pandas objects, there is a data structure called +``BlockManager`` which, at a high level, is responsible for managing the +physical arrays where the data inside a Series or DataFrame is looked +after (also Panel / PanelND structure, even though these are on their way to +deprecation). + +While this data structure has served pandas well since its birth 5 years ago +(Summer 2011), it has a number of problems that make its removal and +replacement with something else an attractive option. + +The goal of this section is to explain what the BlockManager is, why it exists +at all, and why we should consider removing it. + +What is ``BlockManager`` and why does it exist? +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +The reason that ``BlockManager`` exists at all goes back to some ancient pandas +history. Originally, the data in ``pandas.DataFrame`` was stored in a Python +``dict`` object. If you pull up pandas 0.1 or 0.2, you will see this. + +Since the business logic of pandas's internals was originally implemented in +pure Python, as it is still is (but much larger / more complex), there was a +marked performance difference between column-oriented operations and +row-oriented operations. The reason for this is not really a memory layout +issue (NumPy users know about how contiguous memory access produces much better +performance) so much as a reliance on NumPy's two-dimensional array operations +for carrying out pandas's computations. So, to do anything row oriented on an +all-numeric DataFrame, pandas would concatenate all of the columns together +(using ``numpy.vstack`` or ``numpy.hstack``) then use array broadcasting or +methods like ``ndarray.sum`` (combined with ``np.isnan`` to mind missing data) +to carry out certain operations. + +1. pandas's early users (i.e. AQR employees) beseeched me to address this + performance issue. Thus ``DataMatrix`` was created, a roughly API-equivalent + object whose internal storage was a 2D NumPy array, intended to be of a + homogeneous type (e.g. ``numpy.float64``). The downside of this was that if + you inserted a string column, everything would become ``numpy.object_`` + dtype. Users did not like that. + +2. It had become apparent that the dichotomy between DataFrame and DataMatrix + (and when to use each) was harming pandas's adoption and confusing users. So + I set about creating a hybrid data structure that had "the best of both + worlds". + +3. The idea was that the BlockManager would track collections of NumPy arrays + having the same dtype, particular as columns were inserted or removed + (i.e. the *building* phase of the DataFrame's lifetime). + +4. 
When you would invoke an operation that benefited from a single + *consolidated* 2-dimensional ndarray of say ``float64`` dtype (for example: + using ``reindex`` or performing a row-oriented operation), the BlockManager + would glue together its accumulated pieces to create a single 2D ndarray of + each data type. This is called **consolidation** in the codebase. + +5. Since in practice, heterogeneous DataFrames had different types interspersed + amongst their columns, the BlockManager maintains a mapping between the + absolute column position and the relative position within the type-specific + 2D "block". + +6. Over time, the BlockManager has been generalized for the 1 through N + dimensional cases, not just the 2D case, so that even Series has a lean + "SingleBlockManager" internally. + +Drawbacks of BlockManager +~~~~~~~~~~~~~~~~~~~~~~~~~ + +While this data structure has enabled pandas to make it this far in life, it +has a number of drawbacks (not a complete list): + +1. **Code complexity**: this has manifested in a number of ways (and probably + others that I'm missing) + + * Making some of the most important algorithms in pandas fast, like joins + and reshape operations, requires carefully constructing the precise block + structure of the output DataFrame so that no further copying or + consolidation will take place. + + * Adding new custom data types to DataFrame and not losing their metadata + (e.g. time zones or categories) has had a sort of "fan out" effect + touching numerous parts of the BlockManager internals. + +2. **Loss of user visibility into memory use and memory layout**: With large + data sets, some "naively" constructed DataFrame objects (e.g. from a dict of + ndarrays) can produce a memory-doubling effect that may cause out-of-memory + errors. Also, consolidated blocks can (depending on the version of pandas) + result in columns having strided / non-contiguous data, resulting in + degraded performance in column-oriented operations. + +3. **Unavoidable consolidation**: Fairly common operations, like ``read_csv``, + may require a consolidation step after completion, which for large data may + result in performance or memory overhead (similar to the above bullet + point). + +4. **Microperformance issues / indexing slowness**: since a DataFrame can be a + sort of many-layered onion, many common pandas operations may weave through + dozens of different functions navigating the structure of the object and + producing the appropriate output. I will talk more about microperformance + later. + +Replacing BlockManager without weakening pandas +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +Our goal in replacing BlockManager would be to achieve: + +* Substantially simpler code +* Easier extensibility with new logical types +* Performance on par (or better) the current implementation +* Better user control over memory use and layout +* Improved microperformance + +I believe we can do this, but it's will require a significant inversion of the +internal code architecture to involve a more native code and less interpreted +Python. For example, it will be difficult or impossible to achieve comparable +performance in row-oriented operations (on consolidated DataFrame objects) with +pure Python code. + +In the next section, I will start making my case for creating a "native core" +library where we can assemble the low level data structures, logical types, and +memory management for pandas. 
Additionally, we would want to port much of pandas's helper Cython code to live
inside this library and operate directly on the internal data structures,
rather than being orchestrated from the Python interpreter level.

Building "libpandas" in C++11/14 for lowest level implementation tier
======================================================================

Currently, pandas is architecturally structured as follows:

* Pure Python implementation of internal data structure business logic
* Algorithms in Cython (more often) or C (less often) to accelerate
  computationally-intensive algorithms

While this has overall made pandas easier to develop and maintain internally
(perhaps increasingly less so over time!), it has had a number of drawbacks, as
we've discussed. I mentioned microperformance above, so about that:

Microperformance
~~~~~~~~~~~~~~~~

Microperformance (operations taking 1 microsecond to 1 millisecond) has
suffered considerably as pandas's internals have expanded to accommodate new
use cases. Fairly simple operations, from indexing to summary statistics, may
pass through multiple layers of scaffolding before hitting the lowest tier of
computations. Take, for example:

.. ipython:: python

    s = pd.Series(np.random.randn(100))
    s.sum()

Profiling ``s.sum()`` with ``%prun`` in IPython, I see 116 function calls
(pandas 0.18.1). Let's look at the microperformance:

.. code-block:: text

    In [14]: timeit s.sum()
    10000 loops, best of 3: 31.7 µs per loop

    In [15]: v = s.values

    In [16]: timeit v.sum()
    1000000 loops, best of 3: 1.07 µs per loop

While a slightly contrived example, the internal data structures and function
dispatch machinery add 30 microseconds of overhead. That may not sound like a
compelling number, but such a method called 1 million times incurs an
additional 30 seconds of overhead. When you consider microperformance in the
context of custom ``groupby`` operations, for example, this is not so
unrealistic.

C or C++ (C++11, to be specific)?
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

At the risk of instigating a religious programming language debate, pandas's
use of Cython in many places is already very C++-like:

* Generic programming through manual code generation (now using tempita)
  instead of templates

* Auxiliary types and data structures as ``cdef class`` extension types

* Relying on Python's reference counting for garbage collection and cleanup
  after exceptions are raised

The "blend C and Cython" style has aided developer productivity, but I argue
that judicious and responsible use of modern C++ (following a reasonable style
guide, such as Google's C++ style guide or some slight variation) will enable
us to:

* Simplify our existing Cython codebase by using templates (and very limited
  template metaprogramming); see the sketch following this list

* More easily write generic, data-type-specific logic that is inlined and
  resolved at compile time

* Use RAII (exception-safe allocation) and smart pointers (``std::unique_ptr``
  and ``std::shared_ptr``) to simplify memory management

* Define performant C++ classes modeling the current internals, with various
  mechanisms for code reuse or type-specific dynamic dispatch (i.e. through
  template classes, CRTP, or simply virtual functions)

* Use C++11 standard library concurrency tools to more easily create concurrent
  / multithreaded implementations of common pandas algorithms
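To illustrate the first and third points, here is a minimal sketch of what a
dtype-generic kernel written once with a template (instead of being generated
per dtype with tempita) might look like, with the output buffer owned by a
smart pointer. ``TakeKernel`` is a hypothetical name, not part of any existing
or proposed libpandas API.

.. code-block:: c++

    #include <cstdint>
    #include <memory>

    // One definition services all element types at compile time; with tempita
    // we would instead generate a separate function per dtype.
    template <typename T>
    std::unique_ptr<T[]> TakeKernel(const T* values, const int64_t* indices,
                                    int64_t length) {
      // The unique_ptr owns the output buffer, so it is released automatically
      // if an exception propagates out before we return (RAII).
      std::unique_ptr<T[]> out(new T[length]);
      for (int64_t i = 0; i < length; ++i) {
        out[i] = values[indices[i]];
      }
      return out;
    }

    // Usage (types resolved at compile time, no code generation step):
    //   auto taken_f64 = TakeKernel<double>(double_data, idx, n);
    //   auto taken_i32 = TakeKernel<int32_t>(int_data, idx, n);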
By pushing down much of the business logic into C++ (with use of the Python and
NumPy C API where relevant), we will be able to achieve macroperformance on par
with or better than the current BlockManager-based implementation, and handily
better microperformance in indexing and simple analytics.

``pandas.Array`` types
~~~~~~~~~~~~~~~~~~~~~~

My gut feeling is that we would want to create relatively simple container
classes having a common ``pandas::Array`` base type in C++, each of which
models a particular logical type. Each array type would have a corresponding
logical type implementation, in the vein of:

.. code-block:: c++

    class Array {
      // public API omitted

     private:
      std::shared_ptr<DataType> type_;
    };

    class CategoricalType : public DataType {
      // implementation omitted

     private:
      std::shared_ptr<Array> categories_;
    };

    class CategoricalArray : public Array {
     public:
      std::shared_ptr<Array> codes() const;
      std::shared_ptr<Array> categories() const;
      // rest of implementation omitted
    };

An Array wrapping a NumPy array would invoke ``Py_DECREF`` in its destructor,
so that after construction one can proceed largely with C++ programming
semantics without much need for manual memory management.

These Array types would be wrapped and exposed to pandas developers (probably
in Cython).

Index types
~~~~~~~~~~~

Like pandas's current code structure, Index types would be composed from the
Array types and some additional data structures (hash tables) for lookups and
other index operations. These can be similarly exposed to the world via Cython
(and wrapped in a convenient ``pandas.Index`` class).

``pandas.Table``
~~~~~~~~~~~~~~~~

My recommendation is to decommission the BlockManager in favor of a much
simpler low-level Table class, which operates more similarly to an R data.frame
(e.g. no row index). This would look something like:

.. code-block:: c++

    class Table {
     public:
      std::shared_ptr<Array> GetColumn(int i);
      void SetColumn(int i, const std::shared_ptr<Array>& arr);

      // rest of public API omitted
     private:
      // Column index, possibly not necessary
      std::shared_ptr<Index> columns_;

      // List of arrays
      std::vector<std::shared_ptr<Array>> data_;
    };

Operators and dynamic dispatch
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

Under this proposed class structure, it may not make sense to add operations as
class methods. We could possibly do something like:

.. code-block:: c++

    #include "pandas/dispatch.h"

    // other includes omitted

    using ArrayRef = std::shared_ptr<Array>;

    template <typename U, typename V>
    inline ArrayRef TakeImpl(U, V) {
      // Implementation omitted
    }

    ArrayRef Take(ArrayRef values, ArrayRef indices) {
      return Dispatch<TakeImpl>(values, indices);
    }

Here, the ``Dispatch`` template would generate the matrix of logical type
combinations, some of which might throw a not-implemented exception.

There are other approaches to dealing with runtime dispatch that don't carry
too much overhead.

Memory accounting
~~~~~~~~~~~~~~~~~

If pandas's internals are encapsulated in C++ classes inside the libpandas core
library, we could atomically track all memory allocations and deallocations to
produce a precise accounting of the number of bytes that pandas has currently
allocated (excluding memory that is opaque to us; for arrays of Python objects
we would only count the footprint of the ``PyObject*`` pointer array, not the
objects themselves).
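As a rough sketch of what such accounting could look like, all libpandas
allocations could be routed through a pool object holding an atomic byte
counter. The ``MemoryPool`` class below is hypothetical and not an existing
pandas API; it only illustrates the idea.

.. code-block:: c++

    #include <atomic>
    #include <cstdint>
    #include <cstdlib>

    class MemoryPool {
     public:
      // Returns nullptr on failure; a real implementation would raise an error.
      void* Allocate(std::size_t size) {
        void* ptr = std::malloc(size);
        if (ptr != nullptr) {
          // Atomic update, so accounting stays correct across threads.
          bytes_allocated_.fetch_add(static_cast<int64_t>(size));
        }
        return ptr;
      }

      void Free(void* ptr, std::size_t size) {
        std::free(ptr);
        bytes_allocated_.fetch_sub(static_cast<int64_t>(size));
      }

      // Number of bytes currently allocated through this pool; this is what a
      // user-facing "how much memory does pandas own?" query would report.
      int64_t bytes_allocated() const { return bytes_allocated_.load(); }

     private:
      std::atomic<int64_t> bytes_allocated_{0};
    };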
Development toolchain
~~~~~~~~~~~~~~~~~~~~~

Introducing C++11 to pandas's development toolchain will add quite a bit of
complexity for developers, especially compared with pandas's current Cython and
C codebase, which basically builds out of the box for most people. For
cross-platform support, it would be better to use CMake than anything else
(distutils does not have adequate support for C++).

Logical types for strings and possibly other non-numeric data
==============================================================

I believe that frequently-occurring data types, such as UTF-8 strings, are
important enough to deserve a dedicated logical pandas data type. This will
enable us both to enforce tighter API semantics (i.e. attempts to assign a
non-string into string data will be a ``TypeError``) and to improve performance
and memory use under the hood. I will devote an entire section to talking about
strings.

In general, I would be supportive of making Python object (``numpy.object_``
dtype) arrays the solution only for mixed-type arrays and for data types for
which pandas has no native handling.

3rd-party native API (i.e. Cython and C / C++)
==============================================

Developers of 3rd-party projects (myself included) have often expressed a
desire to be able to inspect, construct, or otherwise manipulate pandas objects
(even if in a limited fashion) in compiled code (Cython, C, or C++).

Per the discussion of libpandas and a native core, I would propose the
following:

* Define public-facing ``.pxd`` files that allow developers to use ``cimport``
  and get access to pandas's internal extension types.
* Define factory functions that enable fully formed Series and DataFrame
  objects to be constructed either by Cython API calls or potentially also C++
  libpandas API calls.
* Provide Cython APIs for 3rd-party developers to obtain pointers to the
  underlying C++ objects contained in the wrapper Python objects.

diff --git a/doc/pandas-2.0/source/removals.rst b/doc/pandas-2.0/source/removals.rst
new file mode 100644
index 0000000000000..5f10485b31405
--- /dev/null
+++ b/doc/pandas-2.0/source/removals.rst
@@ -0,0 +1,78 @@

.. _removals:

================================
 Code to remove and other ideas
================================

Dropping Python 2 support
=========================

With Python 2.7 reaching its supported end-of-life in 2020, like some other
Python projects (e.g. IPython / Jupyter) we should seriously contemplate making
pandas 2.0 support only Python 3.5 and higher. In addition to lowering the
development burden at both the C API and pure Python level, we can also finally
look to take advantage of features (things like ``asyncio``, maybe?) only
available in Python 3.

Deprecated code to remove
=========================

* ``.ix`` indexing entirely
* ``Panel`` and ``PanelND`` classes
* Plotting?

Other ideas
===========

Here's a collection of other miscellaneous ideas that don't necessarily fit
elsewhere in these documents.

Column statistics
~~~~~~~~~~~~~~~~~

In quite a few pandas algorithms, there are characteristics of the data that
are very useful to know, such as the following (a sketch of how these might be
computed is given after the list):

* **Monotonicity**: for comparable data (e.g. numbers), is the data sorted /
  strictly increasing? In time series, this permits sorting steps to be
  skipped.

* **Null count**: for data not containing any nulls, the null handling path in
  some algorithms can be skipped entirely.
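Here is a minimal sketch of computing both statistics in a single pass over a
column with a validity bitmap (1 = valid), so they could be cached on the
column and consulted by algorithms. The ``ColumnStats`` and ``ComputeStats``
names are hypothetical, not part of any existing or proposed API.

.. code-block:: c++

    #include <cstddef>
    #include <cstdint>
    #include <vector>

    struct ColumnStats {
      bool is_monotonic_increasing;
      int64_t null_count;
    };

    // Bit i of the bitmap indicates whether slot i holds a valid value.
    inline bool BitIsSet(const std::vector<uint8_t>& bitmap, std::size_t i) {
      return (bitmap[i / 8] >> (i % 8)) & 1;
    }

    ColumnStats ComputeStats(const std::vector<double>& values,
                             const std::vector<uint8_t>& valid_bitmap) {
      ColumnStats stats{true, 0};
      bool have_prev = false;
      double prev = 0.0;
      for (std::size_t i = 0; i < values.size(); ++i) {
        if (!BitIsSet(valid_bitmap, i)) {
          ++stats.null_count;
          continue;
        }
        // Any decrease among the valid values breaks monotonicity.
        if (have_prev && values[i] < prev) {
          stats.is_monotonic_increasing = false;
        }
        prev = values[i];
        have_prev = true;
      }
      return stats;
    }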
Strided arrays: more trouble than they are worth?
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

Per the general discussion around changing DataFrame's internals to contain a
list / ``std::vector`` of arrays internally, for me this raises the question of
whether it is worth continuing to accommodate strided one-dimensional data.

Some pros for eliminating strided data completely:

* Guaranteeing contiguous memory internally will yield more consistent and
  predictable performance.

* Not needing to consider a stride different from 1 means simpler low-level
  array indexing code (e.g. you can work with plain C arrays). The stride is a
  complexity / overhead that leaks into every algorithm that iterates over an
  array.

* You avoid strange situations where a strided view holds onto a base ndarray
  reference to a much larger array.

* **Example:** a previously reported case where the internal orientation
  (column-major vs. row-major) of the array backing a DataFrame was not clear
  to the user.

Some cons:

* It would not be possible to perform zero-copy computations on a strided NumPy
  array.

* Relatedly, initializing a Series or DataFrame from strided memory would
  require allocating an equivalent amount of contiguous memory for each of the
  columns.

For me, at least, the cons are not compelling enough to warrant the code
complexity tradeoff.

diff --git a/doc/pandas-2.0/source/strings.rst b/doc/pandas-2.0/source/strings.rst
new file mode 100644
index 0000000000000..aa065cee4844f
--- /dev/null
+++ b/doc/pandas-2.0/source/strings.rst
@@ -0,0 +1,195 @@

.. _strings:

.. ipython:: python
    :suppress:

    import numpy as np
    import pandas as pd
    np.set_printoptions(precision=4, suppress=True)
    pd.options.display.max_rows = 100

==================================
 Enhanced string / UTF-8 handling
==================================

There are some things we can do to make pandas use less memory and perform
computations significantly faster on string data.

Current string problems
=======================

pandas offers support for columns containing strings (ASCII or Unicode) on a
somewhat ad hoc basis.

* Strings are stored in NumPy arrays of ``PyObject*`` / ``numpy.object_``
  dtype. This has several problems:

  * Computations (e.g. ``groupby`` operations) typically utilize a code path
    for generic Python objects. For example, comparisons and hashing go through
    the ``PyObject_*`` C API functions. In addition to harming multithreading
    due to GIL contention (you must acquire the GIL to use these functions),
    these can also be significantly slower than algorithms that operate on
    ``const char*``, potentially taking advantage of hardware optimizations.

  * String arrays often feature many copies of or references to the same
    PyString. Thus, some algorithms may perform redundant computation. Some
    parts of pandas, like ``pandas.read_csv``, make an effort to deduplicate
    strings to save memory and accelerate computations (e.g. if you do ``x ==
    y``, and ``x`` and ``y`` are references to the same ``PyObject*``, Python
    skips comparing their internal data).

    * Note that this is somewhat mitigated by using ``pandas.Categorical``, but
      this is not the default storage mechanism. More on this below.
  * Using ``PyString`` objects and ``PyObject*`` NumPy storage adds non-trivial
    memory overhead (approximately 24 bytes per unique object) to each value.

Possible solution: new non-NumPy string memory layout
======================================================

My proposed solution to the string conundrum is the following:

* Create a custom string array container type suitable for use in a
  ``pandas.Array``, and a ``pandas.string`` logical data type.
* Require that all strings be encoded as UTF-8.
* By default, represent all string arrays internally as dictionary-encoded,
  a.k.a. categorical. Thus, we will typically only ever have 1 copy of any
  given string in an array.
* Store the actual string data in a packed UTF-8 buffer. I have seen this in a
  number of places, but notably it is the way that Apache Arrow implements
  variable-length collections.

Here is one possible C struct-like layout of this container:

.. code-block:: c++

    typedef struct {
      /* Dictionary indices into the string data, one per logical value */
      uint32_t* indices;

      /* Offsets into the packed string data (one more entry than the number
         of distinct strings) */
      uint32_t* offsets;

      /* The packed UTF-8 data */
      const char* data;

      /* Validity bitmap for nullness */
      uint8_t* bitmap;
    } string_array_t;

Here's an example of what the data would look like:

.. code-block:: text

    actual data : ['foo', 'bars', 'foo', null, 'bars']

    indices: [0, 1, 0, 0, 1]

                                       bitmap[0]
    bitmap (read right-to-left):  |0 0 0 1 0 1 1 1|

    offsets: [0, 3, 7]
    data: ['f', 'o', 'o', 'b', 'a', 'r', 's']

Some benefits of this approach include:

* Much better data locality for low-cardinality categorical data.
* 8.125 bytes (8 bytes plus 1 bit) of memory overhead per value, versus the
  current 24 bytes.
* The data is already dictionary-encoded: casting to ``category`` dtype can be
  performed very cheaply and without duplicating the underlying string memory
  buffer.
* Computations like ``groupby`` on dictionary-encoded strings will be as
  performant as they currently are on ``Categorical``.

Some drawbacks:

* This memory layout is best used as an immutable representation. Mutating
  slots becomes more complex: single-value assignments as well as put /
  array-assignment operations will likely require constructing a new ``data``
  buffer (either by ``realloc`` or some other copying mechanism). Without a
  compaction / "garbage collection" step on this buffer, it will be possible to
  have "dead" memory inside it (for example, if you did ``arr[:] =
  'a-new-string-value'``, all the existing values would be orphaned).

  * Some systems have addressed this issue by storing all string data in a
    "global string hash table". This is something we could explore, but it
    would add quite a bit of complexity to implement and may not be worthwhile
    at this time.

* Indexing into this data structure to obtain a single Python object will
  probably want to call ``PyUnicode_FromStringAndSize`` to construct a string
  (Python 3, therefore Unicode). This requires a memory allocation, whereas
  currently it only has to do a ``Py_INCREF``.

* Many of pandas's existing algorithms that assume Python objects would need to
  be specialized to take advantage of this new memory layout. This is both a
  pro and a con, as it will most likely yield significantly better performance.
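To make the layout above concrete, here is a minimal sketch of reconstructing
the i-th logical value without creating a Python object, assuming the
``string_array_t`` layout shown earlier (repeated here for completeness). The
``GetString`` helper is hypothetical, not a proposed API.

.. code-block:: c++

    #include <cstddef>
    #include <cstdint>

    typedef struct {
      uint32_t* indices;   /* dictionary index for each logical slot       */
      uint32_t* offsets;   /* n_categories + 1 offsets into the packed data */
      const char* data;    /* packed UTF-8 bytes                            */
      uint8_t* bitmap;     /* validity bitmap, 1 bit per slot, 1 = valid    */
    } string_array_t;

    // Returns false if slot i is null; otherwise points *out at the start of
    // the UTF-8 bytes for slot i and sets *out_length to their length.
    bool GetString(const string_array_t* arr, std::size_t i,
                   const char** out, uint32_t* out_length) {
      // Check the validity bitmap first (bit i lives in byte i / 8).
      if (((arr->bitmap[i / 8] >> (i % 8)) & 1) == 0) {
        return false;
      }
      uint32_t category = arr->indices[i];
      *out = arr->data + arr->offsets[category];
      *out_length = arr->offsets[category + 1] - arr->offsets[category];
      return true;
    }

With the example data above, ``GetString(arr, 1, ...)`` would return a pointer
to byte offset 3 with length 4 ('bars'), and ``GetString(arr, 3, ...)`` would
return false because slot 3 is null.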
Concerns / problems
===================

Preserving code that assumes PyString objects
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

Any alternate UTF-8 string in-memory representation should necessarily be able
to yield Python string objects, using ``PyUnicode_FromStringAndSize``. Thus,
code like this could continue to work:

.. ipython:: python

    s = pd.Series(["como estás?"])
    s.map(lambda x: x.upper())

One trade-off is that creating the temporary Python strings is potentially
costly. This could be mitigated for the Python ``str`` methods (by using an
optimized array-oriented code path under the hood), but for arbitrary functions
you would have to pay that cost.

Accommodating non-UTF-8 data
~~~~~~~~~~~~~~~~~~~~~~~~~~~~

Some pandas users will have code that involves various non-UTF-8 Python string
types:

* Native unicode: Py_UCS1, Py_UCS2, Py_UCS4
* Non-UTF-8 PyBytes

.. ipython:: python

    s = pd.Series(["como estás?"])
    s
    s.str.encode('latin-1')
    s.str.encode('latin-1').str.decode('latin-1')

Such data could arise, for example, from reading a CSV file that is in a
non-UTF-8 encoding without indicating the encoding to ``pandas.read_csv``.

My proposed solution to this is to provide a ``binary`` logical type having the
same physical memory layout as UTF-8 strings, with only the metadata being
different. So you would have the following semantics:

* ``latin1_s = s.encode('latin-1')``: this yields a ``binary`` view and
  allocates new memory.
* ``utf8_s = s.encode('utf-8')``: this is a no-op, but yields a ``binary``
  view.
* ``s2 = utf8_s.decode('utf-8')``: this requires running a Unicode codec to
  validate the data against the indicated codec.

Indexing and slicing
~~~~~~~~~~~~~~~~~~~~

Storing strings as UTF-8 bytes means that things like this become more
complicated:

.. ipython:: python

    s = pd.Series(["estás está estáis"])
    s.str[9]
    s.str[6:10]

Since UTF-8 is a variable-length encoding, finding the logical character at a
given position requires either going through the Python C API (expensive, since
it creates new Python objects) or a 3rd-party library. We could make use of the
ICU C++ libraries to implement this.
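To illustrate why positional indexing on UTF-8 data is more involved, here is a
minimal sketch of locating the n-th code point by scanning for non-continuation
bytes. ``FindCodePoint`` is a hypothetical helper, not a proposed API; a
production implementation would more likely rely on a library such as ICU.

.. code-block:: c++

    #include <cstddef>
    #include <cstdint>

    // Returns the byte offset of the n-th (0-based) code point, or -1 if n is
    // out of range. Because UTF-8 is variable length, this is O(size): we must
    // scan from the start rather than jump directly to a byte offset.
    int64_t FindCodePoint(const uint8_t* data, std::size_t size, std::size_t n) {
      std::size_t count = 0;
      for (std::size_t i = 0; i < size; ++i) {
        // Continuation bytes have the bit pattern 10xxxxxx; every other byte
        // starts a new code point.
        bool is_continuation = (data[i] & 0xC0) == 0x80;
        if (!is_continuation) {
          if (count == n) {
            return static_cast<int64_t>(i);
          }
          ++count;
        }
      }
      return -1;
    }

For ``"estás"``, ``FindCodePoint(data, 6, 3)`` returns byte offset 3, the start
of the two-byte sequence encoding ``'á'``.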