
WIP/ERR: raise consistent errors for non-existent files #30264


Closed · wants to merge 10 commits
34 changes: 34 additions & 0 deletions pandas/io/common.py
@@ -3,6 +3,7 @@
import bz2
import codecs
import csv
import errno
import gzip
from io import BufferedIOBase, BytesIO
import mmap
@@ -219,6 +220,9 @@ def get_filepath_or_buffer(
    filepath_or_buffer = _stringify_path(filepath_or_buffer)

    if isinstance(filepath_or_buffer, str) and _is_url(filepath_or_buffer):
        if " " in filepath_or_buffer:
            # GH#17918
            raise ValueError(
                "URL must be quoted before being passed to a read_* function"
            )
        req = urlopen(filepath_or_buffer)
        content_encoding = req.headers.get("Content-Encoding", None)
        if content_encoding == "gzip":
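As an aside on the GH#17918 check above: a URL containing a literal space has to be percent-encoded by the caller before it reaches a read_* function. A minimal sketch, with an assumed example URL that is not from this PR:

from urllib.parse import quote

url = "https://example.com/my data.csv"  # contains a literal space
safe_url = quote(url, safe=":/")  # encode everything except the scheme and path separators
assert safe_url == "https://example.com/my%20data.csv"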
@@ -243,6 +247,7 @@
    )

    if isinstance(filepath_or_buffer, (str, bytes, mmap.mmap)):
        validate_local_path(filepath_or_buffer, mode)
        return _expand_user(filepath_or_buffer), None, compression, False

    if not is_file_like(filepath_or_buffer):
@@ -252,6 +257,31 @@
    return filepath_or_buffer, None, compression, False


def validate_local_path(path: str, mode: Optional[str]):
    """
    Ensure we have consistent error messages for non-existent files.

    Parameters
    ----------
    path : str
    mode : str or None
    """
    if mode is None:
        # Nothing we can do
        return

    if mode.startswith("r"):
        # We only need read permissions, but the file must exist
        if not os.path.exists(path):
            raise FileNotFoundError(errno.ENOENT, f"File {path} does not exist", path)
        if not os.access(path, os.R_OK):
            raise OSError(errno.EACCES, f"Insufficient permissions to read {path}")

    elif mode.startswith(("a", "w")):
        # Figure this out later
        pass
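A quick illustration, not part of the diff, of why validate_local_path uses the three-argument FileNotFoundError: OSError and its subclasses render (errno, strerror, filename) as "[Errno N] strerror: 'filename'", which is exactly the message shape the new tests assert on. The file name here is a hypothetical placeholder:

import errno

try:
    raise FileNotFoundError(errno.ENOENT, "File foo.csv does not exist", "foo.csv")
except FileNotFoundError as err:
    assert err.errno == errno.ENOENT
    assert str(err) == "[Errno 2] File foo.csv does not exist: 'foo.csv'"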


def file_path_to_url(path: str) -> str:
    """
    Convert an absolute native path to a FILE URL.
@@ -420,10 +450,14 @@ def _get_handle(

    # Convert pathlib.Path/py.path.local or string
    path_or_buf = _stringify_path(path_or_buf)

    is_path = isinstance(path_or_buf, str)

    compression, compression_args = _get_compression_method(compression)
    if is_path:
        if not is_s3_url(path_or_buf) and not is_gcs_url(path_or_buf):
            # TODO: better way of checking for local path
            validate_local_path(path_or_buf, mode=mode)
        compression = _infer_compression(path_or_buf, compression)

    if compression:
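One possible answer to the "better way of checking for local path" TODO above, sketched under the assumption that anything without a URL scheme is local; the helper name is invented for illustration and is not part of this PR:

from urllib.parse import urlparse

def _is_local_path(path: str) -> bool:
    # No scheme (a plain path) or a single-letter scheme (a Windows drive
    # letter such as "C:\data.csv", which urlparse reports as scheme "c")
    # is treated as local; "s3://", "gcs://", "https://" etc. are not.
    return len(urlparse(str(path)).scheme) <= 1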
2 changes: 2 additions & 0 deletions pandas/io/excel/_base.py
@@ -20,6 +20,7 @@
    _validate_header_arg,
    get_filepath_or_buffer,
    urlopen,
    validate_local_path,
)
from pandas.io.excel._util import (
    _fill_mi_header,
@@ -351,6 +352,7 @@ def __init__(self, filepath_or_buffer):
            filepath_or_buffer.seek(0)
            self.book = self.load_workbook(filepath_or_buffer)
        elif isinstance(filepath_or_buffer, str):
            validate_local_path(filepath_or_buffer, "r")
            self.book = self.load_workbook(filepath_or_buffer)
        else:
            raise ValueError(
3 changes: 3 additions & 0 deletions pandas/io/json/_json.py
@@ -24,6 +24,7 @@
    _infer_compression,
    _stringify_path,
    get_filepath_or_buffer,
    validate_local_path,
)
from pandas.io.formats.printing import pprint_thing
from pandas.io.parsers import _validate_integer
@@ -59,6 +60,8 @@ def to_json(
    )

    path_or_buf = _stringify_path(path_or_buf)
    validate_local_path(path_or_buf, "w")

    if lines and orient != "records":
        raise ValueError("'lines' keyword only valid when 'orient' is records")
3 changes: 3 additions & 0 deletions pandas/io/parsers.py
@@ -69,6 +69,7 @@
    _validate_header_arg,
    get_filepath_or_buffer,
    is_file_like,
    validate_local_path,
)
from pandas.io.date_converters import generic_parser

@@ -1907,6 +1908,8 @@ def __init__(self, src, **kwds):
        self.usecols, self.usecols_dtype = _validate_usecols_arg(kwds["usecols"])
        kwds["usecols"] = self.usecols

        if isinstance(src, str):
            validate_local_path(src, "r")
        self._reader = parsers.TextReader(src, **kwds)
        self.unnamed_cols = self._reader.unnamed_cols

3 changes: 2 additions & 1 deletion pandas/io/sas/sasreader.py
@@ -1,7 +1,7 @@
 """
 Read SAS sas7bdat or xport files.
 """
-from pandas.io.common import _stringify_path
+from pandas.io.common import _stringify_path, validate_local_path


def read_sas(
@@ -55,6 +55,7 @@ def read_sas(
    filepath_or_buffer = _stringify_path(filepath_or_buffer)
    if not isinstance(filepath_or_buffer, str):
        raise ValueError(buffer_error_msg)
    validate_local_path(filepath_or_buffer, "r")
    fname = filepath_or_buffer.lower()
    if fname.endswith(".xpt"):
        format = "xport"
4 changes: 3 additions & 1 deletion pandas/io/stata.py
@@ -1053,7 +1053,9 @@ def __init__(
         self._native_byteorder = _set_endianness(sys.byteorder)
         path_or_buf = _stringify_path(path_or_buf)
         if isinstance(path_or_buf, str):
-            path_or_buf, encoding, _, should_close = get_filepath_or_buffer(path_or_buf)
+            path_or_buf, encoding, _, should_close = get_filepath_or_buffer(
+                path_or_buf, mode="r"
+            )

         if isinstance(path_or_buf, (str, bytes)):
             self.path_or_buf = open(path_or_buf, "rb")
63 changes: 63 additions & 0 deletions pandas/tests/io/test_common.py
@@ -1,6 +1,7 @@
"""
Tests for the pandas.io.common functionalities
"""
import errno
from io import StringIO
import mmap
import os
@@ -360,3 +361,65 @@ def test_unknown_engine(self):
            df.to_csv(path)
        with pytest.raises(ValueError, match="Unknown engine"):
            pd.read_csv(path, engine="pyt")


@pytest.mark.parametrize(
    "reader",
    [
        pd.read_csv,
        pd.read_table,
        pd.read_fwf,
        pytest.param(pd.read_excel, marks=[td.skip_if_no("xlrd")]),
        pytest.param(
            pd.read_json,
            marks=[
                pytest.mark.xfail(
                    reason=(
                        "Needs to distinguish between a path to read "
                        "vs a string to parse"
                    )
                )
            ],
        ),
        pd.read_pickle,
        pd.read_stata,
        pd.read_sas,
    ],
)
def test_errno_set_nonexistent(reader):
    # GH#13872 ensure that errno is set on a file-not-found error.
    # This also serves as a check that we raise the correct type of error.
    try:
        reader("nonexistent_name")
    except FileNotFoundError as err:
        assert err.errno == errno.ENOENT

        # GH#29125 consistent error messages across read_* functions
        assert "File nonexistent_name does not exist: 'nonexistent_name'" in str(err)

Review thread on the read_json xfail reason:

Member Author:
@WillAyd any thoughts on this? I think I saw a semi-recent discussion about trying to guess whether a string is supposed to be a path or actual content.

Member:
Yea, this was a discussion with @datapythonista and @gfyoung. I don't have a strong point of view in any direction, but I would be against jumping through a lot of hoops to disambiguate.

I guess if we cared to disambiguate, we could probably just look for [, {, " or any leading number (ignoring whitespace) to start, as those are the requirements for JSON.

Member:
I would leave the JSON disambiguation alone for now. I'm still not really that inclined to change it at the moment.

Member Author:
Looking through old issues, there are a lot that revolve around disambiguating paths vs content, e.g. #5924 for pickle. (One clever suggestion was to have pd.read_pickles, pd.read_jsons, etc. as analogues to the stdlib json.loads.)

This doesn't need to be resolved for this PR, but I think we're going to need a decision about whether/when/how to guess if a string is a path or not.
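For concreteness, the disambiguation heuristic floated in the thread above could look like the following sketch; it is explicitly not implemented in this PR, and the function name is invented for illustration:

def _looks_like_json_content(s: str) -> bool:
    # A JSON document must start with [, {, " or a number, so anything
    # else is presumably a path ("-" would also be needed for negative numbers).
    stripped = s.lstrip()
    return stripped.startswith(("[", "{", '"')) or stripped[:1].isdigit()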


@pytest.mark.parametrize(
    "reader",
    [
        pd.read_csv,
        pd.read_table,
        pd.read_fwf,
        pytest.param(pd.read_excel, marks=[td.skip_if_no("xlrd")]),
        pd.read_json,
        pd.read_pickle,
        pd.read_stata,
        pd.read_sas,
    ],
)
@pytest.mark.skipif(is_platform_windows(), reason="permissions work differently")
def test_errno_set_permissions(reader):
    # GH#23784 make sure we get a permissions error when we try to read
    # a file without read permission
    with tm.ensure_clean() as path:
        os.chmod(path, 0o000)
        try:
            reader(path)
        except OSError as err:
            assert err.errno == errno.EACCES
        finally:
            os.chmod(path, 0o777)
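For reference, the behavior this test exercises can be reproduced without pandas at all. A standalone sketch, assuming a non-root POSIX user (root bypasses mode bits, and Windows permission semantics differ, hence the skipif above):

import errno
import os
import tempfile

fd, path = tempfile.mkstemp()
os.close(fd)
os.chmod(path, 0o000)  # remove all permissions
try:
    open(path).read()
except OSError as err:
    assert err.errno == errno.EACCES
finally:
    os.chmod(path, 0o777)
    os.remove(path)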