From 50389df0a5f34d5d8ec0017ca9f4e341f0f89761 Mon Sep 17 00:00:00 2001 From: Cody Maloney Date: Wed, 5 Feb 2025 12:52:46 -0800 Subject: [PATCH 1/5] gh-12005: Align FileIO.readall between _pyio and _io Utilize `bytearray.resize()` and `os.readinto()` to reduce copies and match behavior of `_io.FileIO.readall()`. There is still an extra copy which means twice the memory required compared to FileIO because there isn't a zero-copy path from `bytearray` -> `bytes` currently. On my system reading a 2GB file `./python -m test -M8g -uall test_largefile -m test.test_largefile.PyLargeFileTest.test_large_read -v` Goes from ~2.7 seconds -> ~2.2 seconds --- Lib/_pyio.py | 29 ++++++++++++------- ...-02-05-13-19-15.gh-issue-129005.Sb69L_.rst | 2 ++ 2 files changed, 21 insertions(+), 10 deletions(-) create mode 100644 Misc/NEWS.d/next/Library/2025-02-05-13-19-15.gh-issue-129005.Sb69L_.rst diff --git a/Lib/_pyio.py b/Lib/_pyio.py index 023478aa78c6a0..3c89f6becb6c69 100644 --- a/Lib/_pyio.py +++ b/Lib/_pyio.py @@ -1674,22 +1674,31 @@ def readall(self): except OSError: pass - result = bytearray() + result = bytearray(bufsize) + bytes_read = 0 while True: - if len(result) >= bufsize: - bufsize = len(result) - bufsize += max(bufsize, DEFAULT_BUFFER_SIZE) - n = bufsize - len(result) + if bytes_read >= bufsize: + # Parallels _io/fileio.c new_buffersize + if bufsize > 65536: + addend = bufsize >> 3 + else: + addend = 256 + bufsize + if addend < DEFAULT_BUFFER_SIZE: + addend = DEFAULT_BUFFER_SIZE + bufsize += addend + result.resize(bufsize) + + assert bufsize - bytes_read > 0, "Should always try and read at least one byte" try: - chunk = os.read(self._fd, n) + n = os.readinto(self._fd, memoryview(result)[bytes_read:]) except BlockingIOError: - if result: + if bytes_read: break return None - if not chunk: # reached the end of the file + if n == 0: # Reached the end of the file break - result += chunk - + bytes_read += n + result.resize(bytes_read) return bytes(result) def readinto(self, buffer): diff --git a/Misc/NEWS.d/next/Library/2025-02-05-13-19-15.gh-issue-129005.Sb69L_.rst b/Misc/NEWS.d/next/Library/2025-02-05-13-19-15.gh-issue-129005.Sb69L_.rst new file mode 100644 index 00000000000000..236d7766b92cc6 --- /dev/null +++ b/Misc/NEWS.d/next/Library/2025-02-05-13-19-15.gh-issue-129005.Sb69L_.rst @@ -0,0 +1,2 @@ +``_pyio.FileIO.readall()`` now allocates, resizes, and fills a data buffer +using the same algorithm ``_io.FileIO.readall()`` uses. From 4520ecc0d741ee44d297877e7fd4808f15d6241b Mon Sep 17 00:00:00 2001 From: Cody Maloney Date: Wed, 5 Feb 2025 14:32:26 -0800 Subject: [PATCH 2/5] Update Lib/_pyio.py Co-authored-by: Victor Stinner --- Lib/_pyio.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Lib/_pyio.py b/Lib/_pyio.py index 3c89f6becb6c69..6a558e3ea0c9ca 100644 --- a/Lib/_pyio.py +++ b/Lib/_pyio.py @@ -1688,7 +1688,7 @@ def readall(self): bufsize += addend result.resize(bufsize) - assert bufsize - bytes_read > 0, "Should always try and read at least one byte" + assert len(result) - bytes_read >= 1, "Must read at least one byte" try: n = os.readinto(self._fd, memoryview(result)[bytes_read:]) except BlockingIOError: From 6c3ac57d2c0c4f1b29e416e9a0f61d9a72d8a0a7 Mon Sep 17 00:00:00 2001 From: Cody Maloney Date: Thu, 6 Feb 2025 13:57:47 -0800 Subject: [PATCH 3/5] Use len(result) rather than bufsize, _new_buffersize, make in terms of bytes_read --- Lib/_pyio.py | 23 +++++++++++++---------- 1 file changed, 13 insertions(+), 10 deletions(-) diff --git a/Lib/_pyio.py b/Lib/_pyio.py index 6a558e3ea0c9ca..315d0406748102 100644 --- a/Lib/_pyio.py +++ b/Lib/_pyio.py @@ -1456,6 +1456,17 @@ def write(self, b): return BufferedWriter.write(self, b) +def _new_buffersize(bytes_read): + # Parallels _io/fileio.c new_buffersize + if bytes_read > 65536: + addend = bytes_read >> 3 + else: + addend = 256 + bytes_read + if addend < DEFAULT_BUFFER_SIZE: + addend = DEFAULT_BUFFER_SIZE + return bytes_read + addend + + class FileIO(RawIOBase): _fd = -1 _created = False @@ -1677,16 +1688,8 @@ def readall(self): result = bytearray(bufsize) bytes_read = 0 while True: - if bytes_read >= bufsize: - # Parallels _io/fileio.c new_buffersize - if bufsize > 65536: - addend = bufsize >> 3 - else: - addend = 256 + bufsize - if addend < DEFAULT_BUFFER_SIZE: - addend = DEFAULT_BUFFER_SIZE - bufsize += addend - result.resize(bufsize) + if bytes_read >= len(result): + result.resize(_new_buffersize(bytes_read)) assert len(result) - bytes_read >= 1, "Must read at least one byte" try: From b50fb662024fd9b3e12ecf0b4ae34d0abbb7ee1c Mon Sep 17 00:00:00 2001 From: Cody Maloney Date: Thu, 6 Feb 2025 20:03:53 -0800 Subject: [PATCH 4/5] Simplify control structure --- Lib/_pyio.py | 24 +++++++++++------------- 1 file changed, 11 insertions(+), 13 deletions(-) diff --git a/Lib/_pyio.py b/Lib/_pyio.py index ca21b1d432d437..798fe55670ac2e 100644 --- a/Lib/_pyio.py +++ b/Lib/_pyio.py @@ -1685,23 +1685,21 @@ def readall(self): result = bytearray(bufsize) bytes_read = 0 - while True: - if bytes_read >= len(result): - result.resize(_new_buffersize(bytes_read)) - - assert len(result) - bytes_read >= 1, "Must read at least one byte" - try: - n = os.readinto(self._fd, memoryview(result)[bytes_read:]) - except BlockingIOError: - if bytes_read: - break + try: + while n := os.readinto(self._fd, memoryview(result)[bytes_read:]): + bytes_read += n + if bytes_read >= len(result): + result.resize(_new_buffersize(bytes_read)) + except BlockingIOError: + if not bytes_read: return None - if n == 0: # Reached the end of the file - break - bytes_read += n + + assert len(result) - bytes_read >= 1, \ + "os.readinto buffer size 0 will result in erroneous EOF / returns 0" result.resize(bytes_read) return bytes(result) + def readinto(self, buffer): """Same as RawIOBase.readinto().""" self._checkClosed() From 4da4e3127893ae503c98d412cccbdcfe9f8f8086 Mon Sep 17 00:00:00 2001 From: Cody Maloney Date: Thu, 6 Feb 2025 21:25:42 -0800 Subject: [PATCH 5/5] Fix whitespace --- Lib/_pyio.py | 1 - 1 file changed, 1 deletion(-) diff --git a/Lib/_pyio.py b/Lib/_pyio.py index 798fe55670ac2e..f7370dff19efc8 100644 --- a/Lib/_pyio.py +++ b/Lib/_pyio.py @@ -1699,7 +1699,6 @@ def readall(self): result.resize(bytes_read) return bytes(result) - def readinto(self, buffer): """Same as RawIOBase.readinto().""" self._checkClosed()