Skip to content

gh-120754: Refactor I/O modules to stash whole stat result rather than individual members #123412

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 7 commits into from
Sep 18, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
44 changes: 24 additions & 20 deletions Lib/_pyio.py
Original file line number Diff line number Diff line change
Expand Up @@ -242,14 +242,7 @@ def open(file, mode="r", buffering=-1, encoding=None, errors=None,
buffering = -1
line_buffering = True
if buffering < 0:
buffering = DEFAULT_BUFFER_SIZE
try:
bs = os.fstat(raw.fileno()).st_blksize
except (OSError, AttributeError):
pass
else:
if bs > 1:
buffering = bs
buffering = raw._blksize
if buffering < 0:
raise ValueError("invalid buffering size")
if buffering == 0:
Expand Down Expand Up @@ -1565,19 +1558,15 @@ def __init__(self, file, mode='r', closefd=True, opener=None):
os.set_inheritable(fd, False)

self._closefd = closefd
fdfstat = os.fstat(fd)
self._stat_atopen = os.fstat(fd)
try:
if stat.S_ISDIR(fdfstat.st_mode):
if stat.S_ISDIR(self._stat_atopen.st_mode):
raise IsADirectoryError(errno.EISDIR,
os.strerror(errno.EISDIR), file)
except AttributeError:
# Ignore the AttributeError if stat.S_ISDIR or errno.EISDIR
# don't exist.
pass
self._blksize = getattr(fdfstat, 'st_blksize', 0)
if self._blksize <= 1:
self._blksize = DEFAULT_BUFFER_SIZE
self._estimated_size = fdfstat.st_size

if _setmode:
# don't translate newlines (\r\n <=> \n)
Expand Down Expand Up @@ -1623,6 +1612,17 @@ def __repr__(self):
return ('<%s name=%r mode=%r closefd=%r>' %
(class_name, name, self.mode, self._closefd))

@property
def _blksize(self):
    """Return the preferred buffer size for this file.

    The value is ``st_blksize`` from the stat result stored in
    ``_stat_atopen``. DEFAULT_BUFFER_SIZE is returned instead when no stat
    result is held (``_stat_atopen`` is None), when the stat result has no
    ``st_blksize`` attribute, or when the reported value is not greater
    than 1.
    """
    if self._stat_atopen is None:
        return DEFAULT_BUFFER_SIZE

    blksize = getattr(self._stat_atopen, "st_blksize", 0)
    # WASI sets st_blksize to 0. A value of 1 (or a negative one) is no more
    # useful as a buffer size, so apply the same "> 1" threshold as the C
    # FileIO getter rather than only rejecting falsy values.
    if not blksize or blksize <= 1:
        return DEFAULT_BUFFER_SIZE
    return blksize

def _checkReadable(self):
if not self._readable:
raise UnsupportedOperation('File not open for reading')
Expand Down Expand Up @@ -1655,16 +1655,20 @@ def readall(self):
"""
self._checkClosed()
self._checkReadable()
if self._estimated_size <= 0:
if self._stat_atopen is None or self._stat_atopen.st_size <= 0:
bufsize = DEFAULT_BUFFER_SIZE
else:
bufsize = self._estimated_size + 1
# In order to detect end of file, need a read() of at least 1
# byte which returns size 0. Oversize the buffer by 1 byte so the
# I/O can be completed with two read() calls (one for all data, one
# for EOF) without needing to resize the buffer.
bufsize = self._stat_atopen.st_size + 1

if self._estimated_size > 65536:
if self._stat_atopen.st_size > 65536:
try:
pos = os.lseek(self._fd, 0, SEEK_CUR)
if self._estimated_size >= pos:
bufsize = self._estimated_size - pos + 1
if self._stat_atopen.st_size >= pos:
bufsize = self._stat_atopen.st_size - pos + 1
except OSError:
pass

Expand Down Expand Up @@ -1742,7 +1746,7 @@ def truncate(self, size=None):
if size is None:
size = self.tell()
os.ftruncate(self._fd, size)
self._estimated_size = size
self._stat_atopen = None
return size

def close(self):
Expand Down
83 changes: 57 additions & 26 deletions Modules/_io/fileio.c
Original file line number Diff line number Diff line change
Expand Up @@ -74,8 +74,13 @@ typedef struct {
signed int seekable : 2; /* -1 means unknown */
unsigned int closefd : 1;
char finalizing;
unsigned int blksize;
Py_off_t estimated_size;
/* Stat result which was grabbed at file open, useful for optimizing common
File I/O patterns to be more efficient. This is only guidance / an
estimate, as it is subject to Time-Of-Check to Time-Of-Use (TOCTOU)
issues / bugs. Both the underlying file descriptor and file may be
modified outside of the fileio object / Python (ex. gh-90102, GH-121941,
gh-109523). */
struct _Py_stat_struct *stat_atopen;
PyObject *weakreflist;
PyObject *dict;
} fileio;
Expand Down Expand Up @@ -199,8 +204,7 @@ fileio_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
self->writable = 0;
self->appending = 0;
self->seekable = -1;
self->blksize = 0;
self->estimated_size = -1;
self->stat_atopen = NULL;
self->closefd = 1;
self->weakreflist = NULL;
}
Expand Down Expand Up @@ -256,7 +260,6 @@ _io_FileIO___init___impl(fileio *self, PyObject *nameobj, const char *mode,
#elif !defined(MS_WINDOWS)
int *atomic_flag_works = NULL;
#endif
struct _Py_stat_struct fdfstat;
int fstat_result;
int async_err = 0;

Expand Down Expand Up @@ -454,9 +457,13 @@ _io_FileIO___init___impl(fileio *self, PyObject *nameobj, const char *mode,
#endif
}

self->blksize = DEFAULT_BUFFER_SIZE;
self->stat_atopen = PyMem_New(struct _Py_stat_struct, 1);
if (self->stat_atopen == NULL) {
PyErr_NoMemory();
goto error;
}
Py_BEGIN_ALLOW_THREADS
fstat_result = _Py_fstat_noraise(self->fd, &fdfstat);
fstat_result = _Py_fstat_noraise(self->fd, self->stat_atopen);
Py_END_ALLOW_THREADS
if (fstat_result < 0) {
/* Tolerate fstat() errors other than EBADF. See Issue #25717, where
Expand All @@ -471,25 +478,21 @@ _io_FileIO___init___impl(fileio *self, PyObject *nameobj, const char *mode,
#endif
goto error;
}

PyMem_Free(self->stat_atopen);
self->stat_atopen = NULL;
}
else {
#if defined(S_ISDIR) && defined(EISDIR)
/* On Unix, open will succeed for directories.
In Python, there should be no file objects referring to
directories, so we need a check. */
if (S_ISDIR(fdfstat.st_mode)) {
if (S_ISDIR(self->stat_atopen->st_mode)) {
errno = EISDIR;
PyErr_SetFromErrnoWithFilenameObject(PyExc_OSError, nameobj);
goto error;
}
#endif /* defined(S_ISDIR) */
#ifdef HAVE_STRUCT_STAT_ST_BLKSIZE
if (fdfstat.st_blksize > 1)
self->blksize = fdfstat.st_blksize;
#endif /* HAVE_STRUCT_STAT_ST_BLKSIZE */
if (fdfstat.st_size < PY_SSIZE_T_MAX) {
self->estimated_size = (Py_off_t)fdfstat.st_size;
}
}

#if defined(MS_WINDOWS) || defined(__CYGWIN__)
Expand Down Expand Up @@ -521,6 +524,10 @@ _io_FileIO___init___impl(fileio *self, PyObject *nameobj, const char *mode,
internal_close(self);
_PyErr_ChainExceptions1(exc);
}
if (self->stat_atopen != NULL) {
PyMem_Free(self->stat_atopen);
self->stat_atopen = NULL;
}

done:
#ifdef MS_WINDOWS
Expand Down Expand Up @@ -553,6 +560,10 @@ fileio_dealloc(fileio *self)
if (_PyIOBase_finalize((PyObject *) self) < 0)
return;
_PyObject_GC_UNTRACK(self);
if (self->stat_atopen != NULL) {
PyMem_Free(self->stat_atopen);
self->stat_atopen = NULL;
}
if (self->weakreflist != NULL)
PyObject_ClearWeakRefs((PyObject *) self);
(void)fileio_clear(self);
Expand Down Expand Up @@ -725,20 +736,27 @@ _io_FileIO_readall_impl(fileio *self)
return err_closed();
}

end = self->estimated_size;
if (self->stat_atopen != NULL && self->stat_atopen->st_size < _PY_READ_MAX) {
end = (Py_off_t)self->stat_atopen->st_size;
}
else {
end = -1;
}
if (end <= 0) {
/* Use a default size and resize as needed. */
bufsize = SMALLCHUNK;
}
else {
/* This is probably a real file, so we try to allocate a
buffer one byte larger than the rest of the file. If the
calculation is right then we should get EOF without having
to enlarge the buffer. */
/* This is probably a real file. */
if (end > _PY_READ_MAX - 1) {
bufsize = _PY_READ_MAX;
}
else {
/* In order to detect end of file, need a read() of at
least 1 byte which returns size 0. Oversize the buffer
by 1 byte so the I/O can be completed with two read()
calls (one for all data, one for EOF) without needing
to resize the buffer. */
bufsize = (size_t)end + 1;
}

Expand Down Expand Up @@ -1094,11 +1112,13 @@ _io_FileIO_truncate_impl(fileio *self, PyTypeObject *cls, PyObject *posobj)
return NULL;
}

/* Sometimes a large file is truncated. While estimated_size is only an
estimate, a value much larger than the actual size can result in a
significant over-allocation and sometimes a MemoryError / running out of
memory. */
self->estimated_size = pos;
/* Since the file was truncated, its size at open is no longer accurate
as an estimate. Clear out the stat result, and rely on dynamic resize
code if a readall is requested. */
if (self->stat_atopen != NULL) {
PyMem_Free(self->stat_atopen);
self->stat_atopen = NULL;
}

return posobj;
}
Expand Down Expand Up @@ -1229,16 +1249,27 @@ get_mode(fileio *self, void *closure)
return PyUnicode_FromString(mode_string(self));
}

static PyObject *
get_blksize(fileio *self, void *closure)
{
#ifdef HAVE_STRUCT_STAT_ST_BLKSIZE
if (self->stat_atopen != NULL && self->stat_atopen->st_blksize > 1) {
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I do wonder how realistic the st_blksize values, when available, are for performance purposes; I guess we'll find out.

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This PR should not change the buffer size, does it?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

#117151 (comment) investigated st_blksize a bit previously. In this PR I tried not to change the buffer size at all, only how it is accessed.

With the refactors and optimizations, I have been watching for new issues, and we are finding some as people test main (e.g. gh-113977, for which I wrote a primary fix in #122101; I have more fix ideas that build on the stat_atopen changes).

return PyLong_FromLong(self->stat_atopen->st_blksize);
}
#endif /* HAVE_STRUCT_STAT_ST_BLKSIZE */
return PyLong_FromLong(DEFAULT_BUFFER_SIZE);
}

/* Attribute table for fileio objects: one {name, getter, setter, doc} entry
   per attribute, terminated by the {NULL} sentinel. Every entry passes NULL
   in the setter slot, so only a getter is provided for each attribute.
   "_blksize" is served by get_blksize, which returns DEFAULT_BUFFER_SIZE
   when no usable st_blksize is held in stat_atopen.
   NOTE(review): presumably installed on the FileIO type; the type
   definition is outside this excerpt -- confirm. */
static PyGetSetDef fileio_getsetlist[] = {
{"closed", (getter)get_closed, NULL, "True if the file is closed"},
{"closefd", (getter)get_closefd, NULL,
"True if the file descriptor will be closed by close()."},
{"mode", (getter)get_mode, NULL, "String giving the file mode"},
{"_blksize", (getter)get_blksize, NULL, "Stat st_blksize if available"},
{NULL},
};

static PyMemberDef fileio_members[] = {
{"_blksize", Py_T_UINT, offsetof(fileio, blksize), 0},
{"_finalizing", Py_T_BOOL, offsetof(fileio, finalizing), 0},
{"__weaklistoffset__", Py_T_PYSSIZET, offsetof(fileio, weakreflist), Py_READONLY},
{"__dictoffset__", Py_T_PYSSIZET, offsetof(fileio, dict), Py_READONLY},
Expand Down
Loading