python-trio · njsmith · Oct 31, 2019 · Oct 24, 2019 · Oct 24, 2019 · Oct 24, 2019
diff --git a/azure-pipelines.yml b/azure-pipelines.yml
@@ -39,12 +39,24 @@ jobs:
 - job: 'Windows'
  pool:
  vmImage: 'vs2017-win2016'
- timeoutInMinutes: 10
+ timeoutInMinutes: 20
  strategy:
  # Python version list:
  # 64-bit: https://www.nuget.org/packages/python/
  # 32-bit: https://www.nuget.org/packages/pythonx86/
  matrix:
+ # The LSP tests can be super slow for some reason - like
+ # sometimes it just randomly takes 5 minutes to run the LSP
+ # installer. So we put them at the top, so they can get started
+ # earlier.
+ "with IFS LSP, Python 3.7, 64 bit":
+ python.version: '3.7.2'
+ python.pkg: 'python'
+ lsp: 'http://www.proxifier.com/download/ProxifierSetup.exe'
+ "with non-IFS LSP, Python 3.7, 64 bit":
+ python.version: '3.7.2'
+ python.pkg: 'python'
+ lsp: 'http://download.pctools.com/mirror/updates/9.0.0.2308-SDavfree-lite_en.exe'
  "Python 3.5, 32 bit":
  python.version: '3.5.4'
  python.pkg: 'pythonx86'

diff --git a/ci.sh b/ci.sh
@@ -12,6 +12,11 @@ else
  CODECOV_NAME="${TRAVIS_OS_NAME}-${TRAVIS_PYTHON_VERSION:-unknown}"
 fi
 
+# We always want to retry on failure, and we have to set --connect-timeout to
+# work around a curl bug:
+# https://github.com/curl/curl/issues/4461
+CURL="curl --connect-timeout 5 --retry 5"
+
 ################################################################
 # Bootstrap python environment, if necessary
 ################################################################
@@ -47,12 +52,12 @@ fi
 
 if [ "$TRAVIS_OS_NAME" = "osx" ]; then
  CODECOV_NAME="osx_${MACPYTHON}"
- curl -Lo macpython.pkg https://www.python.org/ftp/python/${MACPYTHON}/python-${MACPYTHON}-macosx10.6.pkg
+ $CURL -Lo macpython.pkg https://www.python.org/ftp/python/${MACPYTHON}/python-${MACPYTHON}-macosx10.6.pkg
  sudo installer -pkg macpython.pkg -target /
  ls /Library/Frameworks/Python.framework/Versions/*/bin/
  PYTHON_EXE=/Library/Frameworks/Python.framework/Versions/*/bin/python3
  # The pip in older MacPython releases doesn't support a new enough TLS
- curl https://bootstrap.pypa.io/get-pip.py | sudo $PYTHON_EXE
+ $CURL https://bootstrap.pypa.io/get-pip.py | sudo $PYTHON_EXE
  sudo $PYTHON_EXE -m pip install virtualenv
  $PYTHON_EXE -m virtualenv testenv
  source testenv/bin/activate
@@ -62,7 +67,7 @@ fi
 
 if [ "$PYPY_NIGHTLY_BRANCH" != "" ]; then
  CODECOV_NAME="pypy_nightly_${PYPY_NIGHTLY_BRANCH}"
- curl -fLo pypy.tar.bz2 http://buildbot.pypy.org/nightly/${PYPY_NIGHTLY_BRANCH}/pypy-c-jit-latest-linux64.tar.bz2
+ $CURL -fLo pypy.tar.bz2 http://buildbot.pypy.org/nightly/${PYPY_NIGHTLY_BRANCH}/pypy-c-jit-latest-linux64.tar.bz2
  if [ ! -s pypy.tar.bz2 ]; then
  # We know:
  # - curl succeeded (200 response code; -f means "exit with error if
@@ -115,12 +120,40 @@ else
  # Actual tests
  python -m pip install -r test-requirements.txt
 
+ # If we're testing with an LSP installed, then it might break network
+ # stuff, so wait until after we've finished setting everything else
+ # up.
+ if [ "$LSP" != "" ]; then
+ echo "Installing LSP from ${LSP}"
+ $CURL -o lsp-installer.exe "$LSP"
+ # Double-slashes are how you tell windows-bash that you want a single
+ # slash, and don't treat this as a unix-style filename that needs to
+ # be replaced by a windows-style filename.
+ # http://www.mingw.org/wiki/Posix_path_conversion
+ ./lsp-installer.exe //silent //norestart
+ echo "Waiting for LSP to appear in Winsock catalog"
+ while ! netsh winsock show catalog | grep "Layered Chain Entry"; do
+ sleep 1
+ done
+ netsh winsock show catalog
+ fi
+
  mkdir empty
  cd empty
 
  INSTALLDIR=$(python -c "import os, trio; print(os.path.dirname(trio.__file__))")
  cp ../setup.cfg $INSTALLDIR
- pytest -W error -r a --junitxml=../test-results.xml --run-slow ${INSTALLDIR} --cov="$INSTALLDIR" --cov-config=../.coveragerc --verbose
+ if pytest -W error -r a --junitxml=../test-results.xml --run-slow ${INSTALLDIR} --cov="$INSTALLDIR" --cov-config=../.coveragerc --verbose; then
+ PASSED=true
+ else
+ PASSED=false
+ fi
+
+ # Remove the LSP again; again we want to do this ASAP to avoid
+ # accidentally breaking other stuff.
+ if [ "$LSP" != "" ]; then
+ netsh winsock reset
+ fi
 
  # Disable coverage on 3.8 until we run 3.8 on Windows CI too
  # https://github.com/python-trio/trio/pull/784#issuecomment-446438407
@@ -143,9 +176,9 @@ else
  # bash <(curl ...)
  # but azure is broken:
  # https://developercommunity.visualstudio.com/content/problem/743824/bash-task-on-windows-suddenly-fails-with-bash-devf.html
- # Also we have to set --connect-timeout to work around:
- # https://github.com/curl/curl/issues/4461
- curl --connect-timeout 5 --retry 5 -o codecov.sh https://codecov.io/bash
+ $CURL -o codecov.sh https://codecov.io/bash
  bash codecov.sh -n "${CODECOV_NAME}" -F "$FLAG"
  fi
+
+ $PASSED
 fi
diff --git a/docs/source/reference-hazmat.rst b/docs/source/reference-hazmat.rst
@@ -165,13 +165,27 @@ All environments provide the following functions:
 
 .. function:: notify_closing(obj)
 
- Call this before closing a file descriptor or ``SOCKET`` handle
- that another task might be waiting on. This will cause any
- `wait_readable` or `wait_writable` calls to immediately raise
+ Call this before closing a file descriptor (on Unix) or socket (on
+ Windows). This will cause any `wait_readable` or `wait_writable`
+ calls on the given object to immediately wake up and raise
  `~trio.ClosedResourceError`.
 
  This doesn't actually close the object – you still have to do that
- yourself afterwards.
+ yourself afterwards. Also, you want to be careful to make sure no
+ new tasks start waiting on the object in between when you call this
+ and when it's actually closed. So to close something properly, you
+ usually want to do these steps in order:
+
+ 1. Explicitly mark the object as closed, so that any new attempts
+ to use it will abort before they start.
+ 2. Call `notify_closing` to wake up any already-existing users.
+ 3. Actually close the object.
+
+ It's also possible to do them in a different order if that's more
+ convenient, *but only if* you make sure not to have any checkpoints in
+ between the steps. This way they all happen in a single atomic
+ step, so other tasks won't be able to tell what order they happened
+ in anyway.
 
 
 Unix-specific API

diff --git a/newsfragments/52.feature.rst b/newsfragments/52.feature.rst
@@ -0,0 +1,19 @@
+On Windows, the `IOCP subsystem
+<https://docs.microsoft.com/en-us/windows/win32/fileio/i-o-completion-ports>`__
+is generally the best way to implement async I/O operations – but it's
+historically been weak at providing ``select``\-style readiness
+notifications, like `trio.hazmat.wait_readable` and
+`~trio.hazmat.wait_writable`. We aren't willing to give those up, so
+previously Trio's Windows backend used a hybrid of ``select`` + IOCP.
+This was complex, slow, and had `limited scalability
+<https://github.com/python-trio/trio/issues/3>`__.
+
+Fortunately, we found a way to implement ``wait_*`` with IOCP, so
+Trio's Windows backend has been completely rewritten, and now uses
+IOCP exclusively. As a user, the only difference you should notice is
+that Trio should now be faster on Windows, and can handle many more
+sockets. This also simplified the code internally, which should allow
+for more improvements in the future.
+
+However, this is somewhat experimental, so if you use Windows then
+please keep an eye out and let us know if you run into any problems!
diff --git a/notes-to-self/afd-lab.py b/notes-to-self/afd-lab.py
@@ -0,0 +1,176 @@
+# A little script to experiment with AFD polling.
+#
+# This cheats and uses a bunch of internal APIs. Don't follow its example. The
+# point is just to experiment with random junk that probably won't work, so we
+# can figure out what we actually do want to do internally.
+
+# Currently this demonstrates what seems to be a weird bug in the Windows
+# kernel. If you:
+#
+# 0. Set up a socket so that it's not writable.
+# 1. Submit a SEND poll operation.
+# 2. Submit a RECEIVE poll operation.
+# 3. Send some data through the socket, to trigger the RECEIVE.
+#
+# ...then the SEND poll operation completes with the RECEIVE flag set.
+#
+# (This bug is why our Windows backend jumps through hoops to avoid ever
+# issuing multiple polls simultaneously for the same socket.)
+#
+# This script's output on my machine:
+#
+# -- Iteration start --
+# Starting a poll for <AFDPollFlags.AFD_POLL_SEND: 4>
+# Starting a poll for <AFDPollFlags.AFD_POLL_RECEIVE: 1>
+# Sending another byte
+# Poll for <AFDPollFlags.AFD_POLL_SEND: 4>: got <AFDPollFlags.AFD_POLL_RECEIVE: 1>
+# Poll for <AFDPollFlags.AFD_POLL_RECEIVE: 1>: Cancelled()
+# -- Iteration start --
+# Starting a poll for <AFDPollFlags.AFD_POLL_SEND: 4>
+# Starting a poll for <AFDPollFlags.AFD_POLL_RECEIVE: 1>
+# Poll for <AFDPollFlags.AFD_POLL_RECEIVE: 1>: got <AFDPollFlags.AFD_POLL_RECEIVE: 1>
+# Sending another byte
+# Poll for <AFDPollFlags.AFD_POLL_SEND: 4>: got <AFDPollFlags.AFD_POLL_RECEIVE: 1>
+#
+# So what we're seeing is:
+#
+# On the first iteration, where there's initially no data in the socket, the
+# SEND completes with the RECEIVE flag set, and the RECEIVE operation doesn't
+# return at all, until we cancel it.
+#
+# On the second iteration, there's already data sitting in the socket from the
+# last loop. This time, the RECEIVE returns immediately with the RECEIVE flag
+# set, which makes sense -- when starting a RECEIVE poll, it does an immediate
+# check to see if there's data already, and if so it does an early exit. But
+# the bizarre thing is, when we then send *another* byte of data, the SEND
+# operation wakes up with the RECEIVE flag set.
+#
+# Why is this bizarre? Let me count the ways:
+#
+# - The SEND operation should never return RECEIVE.
+#
+# - If it does insist on returning RECEIVE, it should do it immediately, since
+# there is already data to receive. But it doesn't.
+#
+# - And then when we send data into a socket that already has data in it, that
+# shouldn't have any effect at all! But instead it wakes up the SEND.
+#
+# - Also, the RECEIVE call did an early check for data and exited out
+# immediately, without going through the whole "register a callback to
+# be notified when data arrives" dance. So even if you do have some bug
+# in tracking which operations should be woken on which state transitions,
+# there's no reason this operation would even touch that tracking data. Yet,
+# if we take out the brief RECEIVE, then the SEND *doesn't* wake up.
+#
+# - Also, if I move the send() call up above the loop, so that there's already
+# data in the socket when we start our first iteration, then you would think
+# that would just make the first iteration act like it was the second
+# iteration. But it doesn't. Instead it makes all the weird behavior
+# disappear entirely.
+#
+# "What do we know … of the world and the universe about us? Our means of
+# receiving impressions are absurdly few, and our notions of surrounding
+# objects infinitely narrow. We see things only as we are constructed to see
+# them, and can gain no idea of their absolute nature. With five feeble senses
+# we pretend to comprehend the boundlessly complex cosmos, yet other beings
+# with wider, stronger, or different range of senses might not only see very
+# differently the things we see, but might see and study whole worlds of
+# matter, energy, and life which lie close at hand yet can never be detected
+# with the senses we have."
+
+import sys
+import os.path
+sys.path.insert(0, os.path.abspath(os.path.dirname(__file__) + r"\.."))
+
+import trio
+print(trio.__file__)
+import trio.testing
+import socket
+
+from trio._core._windows_cffi import (
+ ffi, kernel32, AFDPollFlags, IoControlCodes, ErrorCodes
+)
+from trio._core._io_windows import (
+ _get_base_socket, _afd_helper_handle, _check
+)
+
+class AFDLab:
+ def __init__(self):
+ self._afd = _afd_helper_handle()
+ trio.hazmat.register_with_iocp(self._afd)
+
+ async def afd_poll(self, sock, flags, *, exclusive=0):
+ print(f"Starting a poll for {flags!r}")
+ lpOverlapped = ffi.new("LPOVERLAPPED")
+ poll_info = ffi.new("AFD_POLL_INFO *")
+ poll_info.Timeout = 2**63 - 1 # INT64_MAX
+ poll_info.NumberOfHandles = 1
+ poll_info.Exclusive = exclusive
+ poll_info.Handles[0].Handle = _get_base_socket(sock)
+ poll_info.Handles[0].Status = 0
+ poll_info.Handles[0].Events = flags
+
+ try:
+ _check(
+ kernel32.DeviceIoControl(
+ self._afd,
+ IoControlCodes.IOCTL_AFD_POLL,
+ poll_info,
+ ffi.sizeof("AFD_POLL_INFO"),
+ poll_info,
+ ffi.sizeof("AFD_POLL_INFO"),
+ ffi.NULL,
+ lpOverlapped,
+ )
+ )
+ except OSError as exc:
+ if exc.winerror != ErrorCodes.ERROR_IO_PENDING: # pragma: no cover
+ raise
+
+ try:
+ await trio.hazmat.wait_overlapped(self._afd, lpOverlapped)
+ except:
+ print(f"Poll for {flags!r}: {sys.exc_info()[1]!r}")
+ raise
+ out_flags = AFDPollFlags(poll_info.Handles[0].Events)
+ print(f"Poll for {flags!r}: got {out_flags!r}")
+ return out_flags
+
+
+def fill_socket(sock):
+ try:
+ while True:
+ sock.send(b"x" * 65536)
+ except BlockingIOError:
+ pass
+
+
+async def main():
+ afdlab = AFDLab()
+
+ a, b = socket.socketpair()
+ a.setblocking(False)
+ b.setblocking(False)
+
+ fill_socket(a)
+
+ while True:
+ print("-- Iteration start --")
+ async with trio.open_nursery() as nursery:
+ nursery.start_soon(
+ afdlab.afd_poll,
+ a,
+ AFDPollFlags.AFD_POLL_SEND,
+ )
+ await trio.sleep(2)
+ nursery.start_soon(
+ afdlab.afd_poll,
+ a,
+ AFDPollFlags.AFD_POLL_RECEIVE,
+ )
+ await trio.sleep(2)
+ print("Sending another byte")
+ b.send(b"x")
+ await trio.sleep(2)
+ nursery.cancel_scope.cancel()
+
+trio.run(main)