Skip to content
34 changes: 34 additions & 0 deletions pandas/io/common.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@
import bz2
import codecs
import csv
import errno
import gzip
from io import BufferedIOBase, BytesIO
import mmap
Expand Down Expand Up @@ -219,6 +220,9 @@ def get_filepath_or_buffer(
filepath_or_buffer = _stringify_path(filepath_or_buffer)

if isinstance(filepath_or_buffer, str) and _is_url(filepath_or_buffer):
if " " in filepath_or_buffer:
# GH#17918
raise ValueError("URL must be quotes before passing to read_* function")
req = urlopen(filepath_or_buffer)
content_encoding = req.headers.get("Content-Encoding", None)
if content_encoding == "gzip":
Expand All @@ -243,6 +247,7 @@ def get_filepath_or_buffer(
)

if isinstance(filepath_or_buffer, (str, bytes, mmap.mmap)):
validate_local_path(filepath_or_buffer, mode)
return _expand_user(filepath_or_buffer), None, compression, False

if not is_file_like(filepath_or_buffer):
Expand All @@ -252,6 +257,31 @@ def get_filepath_or_buffer(
return filepath_or_buffer, None, compression, False


def validate_local_path(path: str, mode: Optional[str]):
    """
    Ensure we have consistent error messages for non-existent files.

    Parameters
    ----------
    path : str
        Path to a local file.
    mode : str or None
        File mode the caller intends to use; only read modes are checked.

    Raises
    ------
    FileNotFoundError
        If ``mode`` is a read mode and ``path`` does not exist.
        ``errno`` is set to ``ENOENT`` (GH#13872).
    PermissionError
        If ``mode`` is a read mode and ``path`` exists but is not readable.
        ``errno`` is set to ``EACCES`` (GH#23784).
    """
    if mode is None:
        # Without a mode we cannot know what access is required.
        return

    if mode.startswith("r"):
        # We only need read permissions, but the file must exist
        if not os.path.exists(path):
            raise FileNotFoundError(
                errno.ENOENT, f"File {path} does not exist", path
            )
        if not os.access(path, os.R_OK):
            # PermissionError is the OSError subclass mapped to EACCES, so
            # existing callers catching OSError continue to work; pass the
            # filename for consistency with the FileNotFoundError above.
            raise PermissionError(
                errno.EACCES, f"Insufficient permissions to read {path}", path
            )
    # TODO: decide what (if anything) to validate for write/append modes.


def file_path_to_url(path: str) -> str:
"""
converts an absolute native path to a FILE URL.
Expand Down Expand Up @@ -420,10 +450,14 @@ def _get_handle(

# Convert pathlib.Path/py.path.local or string
path_or_buf = _stringify_path(path_or_buf)

is_path = isinstance(path_or_buf, str)

compression, compression_args = _get_compression_method(compression)
if is_path:
if not is_s3_url(path_or_buf) and not is_gcs_url(path_or_buf):
# TODO: better way of checking for local path
validate_local_path(path_or_buf, mode=mode)
compression = _infer_compression(path_or_buf, compression)

if compression:
Expand Down
2 changes: 2 additions & 0 deletions pandas/io/excel/_base.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,7 @@
_validate_header_arg,
get_filepath_or_buffer,
urlopen,
validate_local_path,
)
from pandas.io.excel._util import (
_fill_mi_header,
Expand Down Expand Up @@ -351,6 +352,7 @@ def __init__(self, filepath_or_buffer):
filepath_or_buffer.seek(0)
self.book = self.load_workbook(filepath_or_buffer)
elif isinstance(filepath_or_buffer, str):
validate_local_path(filepath_or_buffer, "r")
self.book = self.load_workbook(filepath_or_buffer)
else:
raise ValueError(
Expand Down
3 changes: 3 additions & 0 deletions pandas/io/json/_json.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,7 @@
_infer_compression,
_stringify_path,
get_filepath_or_buffer,
validate_local_path,
)
from pandas.io.formats.printing import pprint_thing
from pandas.io.parsers import _validate_integer
Expand Down Expand Up @@ -59,6 +60,8 @@ def to_json(
)

path_or_buf = _stringify_path(path_or_buf)
validate_local_path(path_or_buf, "w")

if lines and orient != "records":
raise ValueError("'lines' keyword only valid when 'orient' is records")

Expand Down
3 changes: 3 additions & 0 deletions pandas/io/parsers.py
Original file line number Diff line number Diff line change
Expand Up @@ -69,6 +69,7 @@
_validate_header_arg,
get_filepath_or_buffer,
is_file_like,
validate_local_path,
)
from pandas.io.date_converters import generic_parser

Expand Down Expand Up @@ -1907,6 +1908,8 @@ def __init__(self, src, **kwds):
self.usecols, self.usecols_dtype = _validate_usecols_arg(kwds["usecols"])
kwds["usecols"] = self.usecols

if isinstance(src, str):
validate_local_path(src, "r")
self._reader = parsers.TextReader(src, **kwds)
self.unnamed_cols = self._reader.unnamed_cols

Expand Down
3 changes: 2 additions & 1 deletion pandas/io/sas/sasreader.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
"""
Read SAS sas7bdat or xport files.
"""
from pandas.io.common import _stringify_path
from pandas.io.common import _stringify_path, validate_local_path


def read_sas(
Expand Down Expand Up @@ -55,6 +55,7 @@ def read_sas(
filepath_or_buffer = _stringify_path(filepath_or_buffer)
if not isinstance(filepath_or_buffer, str):
raise ValueError(buffer_error_msg)
validate_local_path(filepath_or_buffer, "r")
fname = filepath_or_buffer.lower()
if fname.endswith(".xpt"):
format = "xport"
Expand Down
4 changes: 3 additions & 1 deletion pandas/io/stata.py
Original file line number Diff line number Diff line change
Expand Up @@ -1053,7 +1053,9 @@ def __init__(
self._native_byteorder = _set_endianness(sys.byteorder)
path_or_buf = _stringify_path(path_or_buf)
if isinstance(path_or_buf, str):
path_or_buf, encoding, _, should_close = get_filepath_or_buffer(path_or_buf)
path_or_buf, encoding, _, should_close = get_filepath_or_buffer(
path_or_buf, mode="r"
)

if isinstance(path_or_buf, (str, bytes)):
self.path_or_buf = open(path_or_buf, "rb")
Expand Down
63 changes: 63 additions & 0 deletions pandas/tests/io/test_common.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
"""
Tests for the pandas.io.common functionalities
"""
import errno
from io import StringIO
import mmap
import os
Expand Down Expand Up @@ -360,3 +361,65 @@ def test_unknown_engine(self):
df.to_csv(path)
with pytest.raises(ValueError, match="Unknown engine"):
pd.read_csv(path, engine="pyt")


@pytest.mark.parametrize(
"reader",
[
pd.read_csv,
pd.read_table,
pd.read_fwf,
pytest.param(pd.read_excel, marks=[td.skip_if_no("xlrd")]),
pytest.param(
pd.read_json,
marks=[
pytest.mark.xfail(
reason=(
"Needs to distinguish between a path to read "
"vs a string to parse"
Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@WillAyd any thoughts on this? I think I saw a semi-recent discussion about trying to guess whether a string is supposed to be a path or actual content.

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yea this was a discussion with @datapythonista and @gfyoung . I don't have a strong point of view in any direction but would be against jumping through a lot of hoops to disambiguate.

I guess if we cared to disambiguate could probably just look for [, {, " or any leading number (ignoring whitespace) to start as those are the requirements for JSON

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I would leave the JSON disambiguation alone for now. I'm still not really that inclined to change at the moment.

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Looking through old issues, there are a lot that revolve around disambiguating paths vs content, e.g. #5924 for pickle. (one clever suggestion was to have pd.read_pickles, pd.read_jsons, etc as analogues to the stdlib json.loads)

This doesn't need to be resolved for this PR, but I think we're going to need to have a decision about whether/when/how to guess if a string is a path or not.

)
)
],
),
pd.read_pickle,
pd.read_stata,
pd.read_sas,
],
)
def test_errno_set_nonexistent(reader):
    # GH#13872 ensure that errno is set on file not found error
    # This also serves as a check that we raise the correct type of error
    #
    # pytest.raises (instead of a bare try/except) guarantees the test
    # fails if no exception is raised at all, rather than passing vacuously.
    with pytest.raises(FileNotFoundError) as exc_info:
        reader("nonexistent_name")
    err = exc_info.value
    assert err.errno == errno.ENOENT

    # GH#29125 consistent error messages across read_* functions
    assert "File nonexistent_name does not exist: 'nonexistent_name'" in str(err)


@pytest.mark.parametrize(
"reader",
[
pd.read_csv,
pd.read_table,
pd.read_fwf,
pytest.param(pd.read_excel, marks=[td.skip_if_no("xlrd")]),
pd.read_json,
pd.read_pickle,
pd.read_stata,
pd.read_sas,
],
)
# skipif (not skip) is required for conditional skipping: pytest.mark.skip
# takes no condition argument, so the original decorator did not actually
# gate on the platform check.
@pytest.mark.skipif(is_platform_windows(), reason="permissions work differently")
def test_errno_set_permissions(reader):
    # GH#23784
    # make sure we get a permissions error when we try to read without
    # permission; err.errno should carry EACCES.
    #
    # NOTE(review): if reader() raises nothing (e.g. running as root, where
    # chmod 0o000 does not block reads) the test passes vacuously — confirm
    # whether an `else: fail` is desired in CI.
    with tm.ensure_clean() as path:
        os.chmod(path, 0o000)
        try:
            reader(path)
        except OSError as err:
            assert err.errno == errno.EACCES
        finally:
            # restore permissions so ensure_clean can remove the temp file
            os.chmod(path, 0o777)