Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Rewrite Windows backend to use IOCP exclusively #1269

Merged
merged 30 commits into from
Oct 31, 2019
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
30 commits
Select commit Hold shift + click to select a range
849055b
On Windows, run tests with LSPs installed
njsmith Oct 24, 2019
22b07e5
Improve notify_closing docs
njsmith Oct 24, 2019
c8a63de
Checkpoint: basic AFD support working
njsmith Oct 24, 2019
f7ac5aa
Checkpoint: a beautiful but doomed approach
njsmith Oct 24, 2019
d17431b
Rewritten and working (?) IOCP-only windows backend
njsmith Oct 25, 2019
e877b4b
Merge branch 'master' of github.com:python-trio/trio into deselect
njsmith Oct 25, 2019
d73b3cf
Remove debug prints
njsmith Oct 25, 2019
c8efc39
yapf
njsmith Oct 25, 2019
3fdfbe9
Remove a bit more debug code
njsmith Oct 25, 2019
d6d8fac
Move LSP tests up to the top of the azure pipelines order
njsmith Oct 25, 2019
1a12bcd
Don't use enum.IntFlag on python 3.5
njsmith Oct 25, 2019
8c549f3
Re-run gen_exports.py
njsmith Oct 25, 2019
1a8ecbe
remove more debug code
njsmith Oct 25, 2019
45bd65c
remove stale comment
njsmith Oct 25, 2019
86eb6ef
Add test for how notify_closing handles bad input
njsmith Oct 25, 2019
29b9d30
Add some pragma: no cover to errors we think can't happen
njsmith Oct 25, 2019
630910c
Rename Windows backend statistics attributes to match epoll backend
njsmith Oct 25, 2019
2f1519f
Convert epoll statistics test into generic IO statistics test
njsmith Oct 25, 2019
c558574
Add notes-to-self/ to document the weird simultaneous-poll bug
njsmith Oct 27, 2019
0320a8b
Make our set of poll flags more complete
njsmith Oct 27, 2019
a28dfa0
Better comments
njsmith Oct 27, 2019
9140391
Let's be paranoid and double-check for weird broken network configs
njsmith Oct 27, 2019
c604716
Minor cleanups to test_io_manager_statistics
njsmith Oct 27, 2019
e4da787
Add explicit test that wait_* error out properly on invalid values
njsmith Oct 27, 2019
b2b84d0
Apparently you get LOCAL_CLOSE notifications whether you want them or…
njsmith Oct 27, 2019
2ce3bcc
Rewrite newsfragment to explain the change better
njsmith Oct 27, 2019
f9c3b54
Wording tweak
njsmith Oct 29, 2019
7818f58
Update comments to clarify the impact of the AFD_IOCTL_POLL bug
njsmith Oct 29, 2019
ef2d637
Add script to check how wait_readable scales with the number of sockets
njsmith Oct 31, 2019
0b9af3b
Tweak newsfragment again
njsmith Oct 31, 2019
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
14 changes: 13 additions & 1 deletion azure-pipelines.yml
Original file line number Diff line number Diff line change
Expand Up @@ -39,12 +39,24 @@ jobs:
- job: 'Windows'
pool:
vmImage: 'vs2017-win2016'
timeoutInMinutes: 10
timeoutInMinutes: 20
strategy:
# Python version list:
# 64-bit: https://www.nuget.org/packages/python/
# 32-bit: https://www.nuget.org/packages/pythonx86/
matrix:
# The LSP tests can be super slow for some reason - like
# sometimes it just randomly takes 5 minutes to run the LSP
# installer. So we put them at the top, so they can get started
# earlier.
"with IFS LSP, Python 3.7, 64 bit":
python.version: '3.7.2'
python.pkg: 'python'
lsp: 'http://www.proxifier.com/download/ProxifierSetup.exe'
"with non-IFS LSP, Python 3.7, 64 bit":
python.version: '3.7.2'
python.pkg: 'python'
lsp: 'http://download.pctools.com/mirror/updates/9.0.0.2308-SDavfree-lite_en.exe'
"Python 3.5, 32 bit":
python.version: '3.5.4'
python.pkg: 'pythonx86'
Expand Down
47 changes: 40 additions & 7 deletions ci.sh
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,11 @@ else
CODECOV_NAME="${TRAVIS_OS_NAME}-${TRAVIS_PYTHON_VERSION:-unknown}"
fi

# We always want to retry on failure, and we have to set --connect-timeout to
# work around a curl bug:
# https://github.com/curl/curl/issues/4461
CURL="curl --connect-timeout 5 --retry 5"

################################################################
# Bootstrap python environment, if necessary
################################################################
Expand Down Expand Up @@ -47,12 +52,12 @@ fi

if [ "$TRAVIS_OS_NAME" = "osx" ]; then
CODECOV_NAME="osx_${MACPYTHON}"
curl -Lo macpython.pkg https://www.python.org/ftp/python/${MACPYTHON}/python-${MACPYTHON}-macosx10.6.pkg
$CURL -Lo macpython.pkg https://www.python.org/ftp/python/${MACPYTHON}/python-${MACPYTHON}-macosx10.6.pkg
sudo installer -pkg macpython.pkg -target /
ls /Library/Frameworks/Python.framework/Versions/*/bin/
PYTHON_EXE=/Library/Frameworks/Python.framework/Versions/*/bin/python3
# The pip in older MacPython releases doesn't support a new enough TLS
curl https://bootstrap.pypa.io/get-pip.py | sudo $PYTHON_EXE
$CURL https://bootstrap.pypa.io/get-pip.py | sudo $PYTHON_EXE
sudo $PYTHON_EXE -m pip install virtualenv
$PYTHON_EXE -m virtualenv testenv
source testenv/bin/activate
Expand All @@ -62,7 +67,7 @@ fi

if [ "$PYPY_NIGHTLY_BRANCH" != "" ]; then
CODECOV_NAME="pypy_nightly_${PYPY_NIGHTLY_BRANCH}"
curl -fLo pypy.tar.bz2 http://buildbot.pypy.org/nightly/${PYPY_NIGHTLY_BRANCH}/pypy-c-jit-latest-linux64.tar.bz2
$CURL -fLo pypy.tar.bz2 http://buildbot.pypy.org/nightly/${PYPY_NIGHTLY_BRANCH}/pypy-c-jit-latest-linux64.tar.bz2
if [ ! -s pypy.tar.bz2 ]; then
# We know:
# - curl succeeded (200 response code; -f means "exit with error if
Expand Down Expand Up @@ -115,12 +120,40 @@ else
# Actual tests
python -m pip install -r test-requirements.txt

# If we're testing with an LSP installed, then it might break network
# stuff, so wait until after we've finished setting everything else
# up.
if [ "$LSP" != "" ]; then
echo "Installing LSP from ${LSP}"
$CURL -o lsp-installer.exe "$LSP"
# Double-slashes are how you tell windows-bash that you want a single
# slash, and don't treat this as a unix-style filename that needs to
# be replaced by a windows-style filename.
# http://www.mingw.org/wiki/Posix_path_conversion
./lsp-installer.exe //silent //norestart
echo "Waiting for LSP to appear in Winsock catalog"
while ! netsh winsock show catalog | grep "Layered Chain Entry"; do
sleep 1
done
netsh winsock show catalog
fi

mkdir empty
cd empty

INSTALLDIR=$(python -c "import os, trio; print(os.path.dirname(trio.__file__))")
cp ../setup.cfg $INSTALLDIR
pytest -W error -r a --junitxml=../test-results.xml --run-slow ${INSTALLDIR} --cov="$INSTALLDIR" --cov-config=../.coveragerc --verbose
if pytest -W error -r a --junitxml=../test-results.xml --run-slow ${INSTALLDIR} --cov="$INSTALLDIR" --cov-config=../.coveragerc --verbose; then
PASSED=true
else
PASSED=false
fi

# Remove the LSP again; again we want to do this ASAP to avoid
# accidentally breaking other stuff.
if [ "$LSP" != "" ]; then
netsh winsock reset
fi

# Disable coverage on 3.8 until we run 3.8 on Windows CI too
# https://github.com/python-trio/trio/pull/784#issuecomment-446438407
Expand All @@ -143,9 +176,9 @@ else
# bash <(curl ...)
# but azure is broken:
# https://developercommunity.visualstudio.com/content/problem/743824/bash-task-on-windows-suddenly-fails-with-bash-devf.html
# Also we have to set --connect-timeout to work around:
# https://github.com/curl/curl/issues/4461
curl --connect-timeout 5 --retry 5 -o codecov.sh https://codecov.io/bash
$CURL -o codecov.sh https://codecov.io/bash
bash codecov.sh -n "${CODECOV_NAME}" -F "$FLAG"
fi

$PASSED
fi
22 changes: 18 additions & 4 deletions docs/source/reference-hazmat.rst
Original file line number Diff line number Diff line change
Expand Up @@ -165,13 +165,27 @@ All environments provide the following functions:

.. function:: notify_closing(obj)

Call this before closing a file descriptor or ``SOCKET`` handle
that another task might be waiting on. This will cause any
`wait_readable` or `wait_writable` calls to immediately raise
Call this before closing a file descriptor (on Unix) or socket (on
Windows). This will cause any `wait_readable` or `wait_writable`
calls on the given object to immediately wake up and raise
`~trio.ClosedResourceError`.

This doesn't actually close the object – you still have to do that
yourself afterwards.
yourself afterwards. Also, you want to be careful to make sure no
new tasks start waiting on the object in between when you call this
and when it's actually closed. So to close something properly, you
usually want to do these steps in order:

1. Explicitly mark the object as closed, so that any new attempts
to use it will abort before they start.
2. Call `notify_closing` to wake up any already-existing users.
3. Actually close the object.

It's also possible to do them in a different order if that's more
convenient, *but only if* you make sure not to have any checkpoints in
between the steps. This way they all happen in a single atomic
step, so other tasks won't be able to tell what order they happened
in anyway.


Unix-specific API
Expand Down
19 changes: 19 additions & 0 deletions newsfragments/52.feature.rst
Original file line number Diff line number Diff line change
@@ -0,0 +1,19 @@
On Windows, the `IOCP subsystem
<https://docs.microsoft.com/en-us/windows/win32/fileio/i-o-completion-ports>`__
is generally the best way to implement async I/O operations – but it's
historically been weak at providing ``select``\-style readiness
notifications, like `trio.hazmat.wait_readable` and
`~trio.hazmat.wait_writable`. We aren't willing to give those up, so
previously Trio's Windows backend used a hybrid of ``select`` + IOCP.
This was complex, slow, and had `limited scalability
<https://github.com/python-trio/trio/issues/3>`__.

Fortunately, we found a way to implement ``wait_*`` with IOCP, so
Trio's Windows backend has been completely rewritten, and now uses
IOCP exclusively. As a user, the only difference you should notice is
that Trio should now be faster on Windows, and can handle many more
sockets. This also simplified the code internally, which should allow
for more improvements in the future.

However, this is somewhat experimental, so if you use Windows then
please keep an eye out and let us know if you run into any problems!
176 changes: 176 additions & 0 deletions notes-to-self/afd-lab.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,176 @@
# A little script to experiment with AFD polling.
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Is it possible for the IOCP-based I/O manager implemented in this PR to run into the bug elicited by this script? If not, maybe comment why not?

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Right, this bug is why we have all this machinery to avoid ever issuing more than one IOCTL_AFD_POLL operation on the same socket at the same time :-). Updated comments in both afd-lab.py and _io_windows.py.

#
# This cheats and uses a bunch of internal APIs. Don't follow its example. The
# point is just to experiment with random junk that probably won't work, so we
# can figure out what we actually do want to do internally.

# Currently this demonstrates what seems to be a weird bug in the Windows
# kernel. If you:
#
# 0. Set up a socket so that it's not writable.
# 1. Submit a SEND poll operation.
# 2. Submit a RECEIVE poll operation.
# 3. Send some data through the socket, to trigger the RECEIVE.
#
# ...then the SEND poll operation completes with the RECEIVE flag set.
#
# (This bug is why our Windows backend jumps through hoops to avoid ever
# issuing multiple polls simultaneously for the same socket.)
#
# This script's output on my machine:
#
# -- Iteration start --
# Starting a poll for <AFDPollFlags.AFD_POLL_SEND: 4>
# Starting a poll for <AFDPollFlags.AFD_POLL_RECEIVE: 1>
# Sending another byte
# Poll for <AFDPollFlags.AFD_POLL_SEND: 4>: got <AFDPollFlags.AFD_POLL_RECEIVE: 1>
# Poll for <AFDPollFlags.AFD_POLL_RECEIVE: 1>: Cancelled()
# -- Iteration start --
# Starting a poll for <AFDPollFlags.AFD_POLL_SEND: 4>
# Starting a poll for <AFDPollFlags.AFD_POLL_RECEIVE: 1>
# Poll for <AFDPollFlags.AFD_POLL_RECEIVE: 1>: got <AFDPollFlags.AFD_POLL_RECEIVE: 1>
# Sending another byte
# Poll for <AFDPollFlags.AFD_POLL_SEND: 4>: got <AFDPollFlags.AFD_POLL_RECEIVE: 1>
#
# So what we're seeing is:
#
# On the first iteration, where there's initially no data in the socket, the
# SEND completes with the RECEIVE flag set, and the RECEIVE operation doesn't
# return at all, until we cancel it.
#
# On the second iteration, there's already data sitting in the socket from the
# last loop. This time, the RECEIVE returns immediately with the RECEIVE flag
# set, which makes sense -- when starting a RECEIVE poll, it does an immediate
# check to see if there's data already, and if so it does an early exit. But
# the bizarre thing is, when we then send *another* byte of data, the SEND
# operation wakes up with the RECEIVE flag set.
#
# Why is this bizarre? Let me count the ways:
#
# - The SEND operation should never return RECEIVE.
#
# - If it does insist on returning RECEIVE, it should do it immediately, since
# there is already data to receive. But it doesn't.
#
# - And then when we send data into a socket that already has data in it, that
# shouldn't have any effect at all! But instead it wakes up the SEND.
#
# - Also, the RECEIVE call did an early check for data and exited out
# immediately, without going through the whole "register a callback to
# be notified when data arrives" dance. So even if you do have some bug
# in tracking which operations should be woken on which state transitions,
# there's no reason this operation would even touch that tracking data. Yet,
# if we take out the brief RECEIVE, then the SEND *doesn't* wake up.
#
# - Also, if I move the send() call up above the loop, so that there's already
# data in the socket when we start our first iteration, then you would think
# that would just make the first iteration act like it was the second
# iteration. But it doesn't. Instead it makes all the weird behavior
# disappear entirely.
#
# "What do we know … of the world and the universe about us? Our means of
# receiving impressions are absurdly few, and our notions of surrounding
# objects infinitely narrow. We see things only as we are constructed to see
# them, and can gain no idea of their absolute nature. With five feeble senses
# we pretend to comprehend the boundlessly complex cosmos, yet other beings
# with wider, stronger, or different range of senses might not only see very
# differently the things we see, but might see and study whole worlds of
# matter, energy, and life which lie close at hand yet can never be detected
# with the senses we have."

import sys
import os.path
# Put the parent of this script's directory at the front of sys.path *before*
# importing trio -- presumably so the in-tree checkout is picked up instead of
# an installed copy (TODO confirm). Note the hard-coded backslash separator:
# this path arithmetic only works on Windows.
sys.path.insert(0, os.path.abspath(os.path.dirname(__file__) + r"\.."))

import trio
# Show which trio module actually got imported.
print(trio.__file__)
import trio.testing
import socket

# Everything below comes from underscore-prefixed (internal) trio modules;
# this script deliberately reaches past the public API.
from trio._core._windows_cffi import (
ffi, kernel32, AFDPollFlags, IoControlCodes, ErrorCodes
)
from trio._core._io_windows import (
_get_base_socket, _afd_helper_handle, _check
)

class AFDLab:
    """Minimal harness for issuing raw ``IOCTL_AFD_POLL`` operations.

    Owns a single AFD helper handle, registered with Trio's IOCP, through
    which every poll started by `afd_poll` is submitted.
    """

    def __init__(self):
        helper = _afd_helper_handle()
        self._afd = helper
        # Completions for operations issued on this handle are picked up
        # later via trio.hazmat.wait_overlapped() in afd_poll().
        trio.hazmat.register_with_iocp(helper)

    async def afd_poll(self, sock, flags, *, exclusive=0):
        """Start one AFD poll on ``sock`` and wait for it to complete.

        Progress is reported with ``print`` at submission time and again on
        completion or failure.

        Args:
          sock: socket to poll; its base socket handle is looked up with
            ``_get_base_socket``.
          flags: events to wait for, stored in the request's ``Events``
            field (e.g. ``AFDPollFlags.AFD_POLL_SEND``).
          exclusive: value stored in the request's ``Exclusive`` field.

        Returns:
          The ``AFDPollFlags`` read back from the request's ``Events`` field
          once the operation has completed.

        Raises:
          OSError: if submitting the operation fails with anything other
            than ``ERROR_IO_PENDING``.
          Any exception raised while waiting for completion (including
          cancellation) is printed and then re-raised unchanged.
        """
        print(f"Starting a poll for {flags!r}")

        overlapped = ffi.new("LPOVERLAPPED")
        request = ffi.new("AFD_POLL_INFO *")
        request_size = ffi.sizeof("AFD_POLL_INFO")

        request.Timeout = 2**63 - 1  # INT64_MAX
        request.NumberOfHandles = 1
        request.Exclusive = exclusive
        request.Handles[0].Handle = _get_base_socket(sock)
        request.Handles[0].Status = 0
        request.Handles[0].Events = flags

        # The same buffer is used as both input and output, so the result is
        # read back from `request` after completion.
        try:
            succeeded = kernel32.DeviceIoControl(
                self._afd,
                IoControlCodes.IOCTL_AFD_POLL,
                request,
                request_size,
                request,
                request_size,
                ffi.NULL,
                overlapped,
            )
            _check(succeeded)
        except OSError as exc:
            # ERROR_IO_PENDING just means the operation was queued; anything
            # else is a real failure.
            if exc.winerror == ErrorCodes.ERROR_IO_PENDING:
                pass
            else:  # pragma: no cover
                raise

        try:
            await trio.hazmat.wait_overlapped(self._afd, overlapped)
        except BaseException:
            # Report everything, cancellation included, then let it propagate.
            print(f"Poll for {flags!r}: {sys.exc_info()[1]!r}")
            raise
        else:
            out_flags = AFDPollFlags(request.Handles[0].Events)
            print(f"Poll for {flags!r}: got {out_flags!r}")
            return out_flags


def fill_socket(sock):
    """Send into ``sock`` until its send buffer cannot take any more data.

    Intended for non-blocking sockets: the loop only ends when ``send``
    raises ``BlockingIOError``, which is swallowed because it is the
    expected "buffer is full" signal. Afterwards the socket is not writable.

    Args:
      sock: an object with a ``send(bytes)`` method, e.g. a non-blocking
        ``socket.socket``.

    Returns:
      None.
    """
    # Build the 64 KiB payload once instead of on every loop iteration.
    chunk = b"x" * 65536
    try:
        while True:
            sock.send(chunk)
    except BlockingIOError:
        pass


async def main():
    """Run the overlapping-poll experiment in an endless loop.

    Sets up a non-blocking socket pair, fills one end's send buffer so it is
    not writable, and then repeats forever: start a SEND poll on the full
    socket, start a RECEIVE poll on the same socket, send one byte from the
    peer, and finally cancel whatever polls are still outstanding. Pauses
    between the steps keep the printed output readable.
    """
    lab = AFDLab()

    full_sock, peer = socket.socketpair()
    for endpoint in (full_sock, peer):
        endpoint.setblocking(False)

    # Make full_sock non-writable so the SEND poll cannot complete at once.
    fill_socket(full_sock)

    pause = 2  # passed to trio.sleep() between steps

    while True:
        print("-- Iteration start --")
        async with trio.open_nursery() as nursery:
            # Step 1: SEND poll on the socket whose send buffer is full.
            nursery.start_soon(
                lab.afd_poll, full_sock, AFDPollFlags.AFD_POLL_SEND
            )
            await trio.sleep(pause)

            # Step 2: a second, simultaneous poll on the same socket.
            nursery.start_soon(
                lab.afd_poll, full_sock, AFDPollFlags.AFD_POLL_RECEIVE
            )
            await trio.sleep(pause)

            # Step 3: make data arrive on the polled socket.
            print("Sending another byte")
            peer.send(b"x")
            await trio.sleep(pause)

            # Cancel any poll that has not completed so the nursery can exit.
            nursery.cancel_scope.cancel()

# Script entry point. main() loops forever, so this call never returns
# normally; it runs as soon as the module is executed (there is no
# __main__ guard).
trio.run(main)
Loading