diff --git a/pandas/io/common.py b/pandas/io/common.py index a01011cd7d4e4..2ae996acbb079 100644 --- a/pandas/io/common.py +++ b/pandas/io/common.py @@ -3,6 +3,7 @@ import bz2 import codecs import csv +import errno import gzip from io import BufferedIOBase, BytesIO import mmap @@ -219,6 +220,9 @@ def get_filepath_or_buffer( filepath_or_buffer = _stringify_path(filepath_or_buffer) if isinstance(filepath_or_buffer, str) and _is_url(filepath_or_buffer): + if " " in filepath_or_buffer: + # GH#17918 + raise ValueError("URL must be quoted before passing to read_* function") req = urlopen(filepath_or_buffer) content_encoding = req.headers.get("Content-Encoding", None) if content_encoding == "gzip": @@ -243,6 +247,7 @@ def get_filepath_or_buffer( ) if isinstance(filepath_or_buffer, (str, bytes, mmap.mmap)): + validate_local_path(filepath_or_buffer, mode) return _expand_user(filepath_or_buffer), None, compression, False if not is_file_like(filepath_or_buffer): @@ -252,6 +257,31 @@ def get_filepath_or_buffer( return filepath_or_buffer, None, compression, False +def validate_local_path(path: str, mode: Optional[str]): + """ + Ensure we have consistent error messages for non-existent files. + + Parameters + ---------- + path : str + mode : str or None + """ + if mode is None: + # Nothing we can do + return + + if mode.startswith("r"): + # We only need read permissions, but the file must exist + if not os.path.exists(path): + raise FileNotFoundError(errno.ENOENT, f"File {path} does not exist", path) + if not os.access(path, os.R_OK): + raise OSError(errno.EACCES, f"Insufficient permissions to read {path}") + + elif mode.startswith(("a", "w")): + pass + # Figure this out later + + def file_path_to_url(path: str) -> str: """ converts an absolute native path to a FILE URL. 
@@ -420,10 +450,14 @@ def _get_handle( # Convert pathlib.Path/py.path.local or string path_or_buf = _stringify_path(path_or_buf) + is_path = isinstance(path_or_buf, str) compression, compression_args = _get_compression_method(compression) if is_path: + if not is_s3_url(path_or_buf) and not is_gcs_url(path_or_buf): + # TODO: better way of checking for local path + validate_local_path(path_or_buf, mode=mode) compression = _infer_compression(path_or_buf, compression) if compression: diff --git a/pandas/io/excel/_base.py b/pandas/io/excel/_base.py index 81d3d46f78bdb..9b887b7e40a10 100644 --- a/pandas/io/excel/_base.py +++ b/pandas/io/excel/_base.py @@ -20,6 +20,7 @@ _validate_header_arg, get_filepath_or_buffer, urlopen, + validate_local_path, ) from pandas.io.excel._util import ( _fill_mi_header, @@ -351,6 +352,7 @@ def __init__(self, filepath_or_buffer): filepath_or_buffer.seek(0) self.book = self.load_workbook(filepath_or_buffer) elif isinstance(filepath_or_buffer, str): + validate_local_path(filepath_or_buffer, "r") self.book = self.load_workbook(filepath_or_buffer) else: raise ValueError( diff --git a/pandas/io/json/_json.py b/pandas/io/json/_json.py index 6cb811bb97755..9ddc6af2859cd 100644 --- a/pandas/io/json/_json.py +++ b/pandas/io/json/_json.py @@ -24,6 +24,7 @@ _infer_compression, _stringify_path, get_filepath_or_buffer, + validate_local_path, ) from pandas.io.formats.printing import pprint_thing from pandas.io.parsers import _validate_integer @@ -59,6 +60,8 @@ def to_json( ) path_or_buf = _stringify_path(path_or_buf) + validate_local_path(path_or_buf, "w") + if lines and orient != "records": raise ValueError("'lines' keyword only valid when 'orient' is records") diff --git a/pandas/io/parsers.py b/pandas/io/parsers.py index a887a537a2201..83510db86b124 100755 --- a/pandas/io/parsers.py +++ b/pandas/io/parsers.py @@ -69,6 +69,7 @@ _validate_header_arg, get_filepath_or_buffer, is_file_like, + validate_local_path, ) from pandas.io.date_converters import 
generic_parser @@ -1907,6 +1908,8 @@ def __init__(self, src, **kwds): self.usecols, self.usecols_dtype = _validate_usecols_arg(kwds["usecols"]) kwds["usecols"] = self.usecols + if isinstance(src, str): + validate_local_path(src, "r") self._reader = parsers.TextReader(src, **kwds) self.unnamed_cols = self._reader.unnamed_cols diff --git a/pandas/io/sas/sasreader.py b/pandas/io/sas/sasreader.py index 6bd3532d538c7..d32988186fb67 100644 --- a/pandas/io/sas/sasreader.py +++ b/pandas/io/sas/sasreader.py @@ -1,7 +1,7 @@ """ Read SAS sas7bdat or xport files. """ -from pandas.io.common import _stringify_path +from pandas.io.common import _stringify_path, validate_local_path def read_sas( @@ -55,6 +55,7 @@ def read_sas( filepath_or_buffer = _stringify_path(filepath_or_buffer) if not isinstance(filepath_or_buffer, str): raise ValueError(buffer_error_msg) + validate_local_path(filepath_or_buffer, "r") fname = filepath_or_buffer.lower() if fname.endswith(".xpt"): format = "xport" diff --git a/pandas/io/stata.py b/pandas/io/stata.py index dbe64e4c0f06d..e426a5c720971 100644 --- a/pandas/io/stata.py +++ b/pandas/io/stata.py @@ -1053,7 +1053,9 @@ def __init__( self._native_byteorder = _set_endianness(sys.byteorder) path_or_buf = _stringify_path(path_or_buf) if isinstance(path_or_buf, str): - path_or_buf, encoding, _, should_close = get_filepath_or_buffer(path_or_buf) + path_or_buf, encoding, _, should_close = get_filepath_or_buffer( + path_or_buf, mode="r" + ) if isinstance(path_or_buf, (str, bytes)): self.path_or_buf = open(path_or_buf, "rb") diff --git a/pandas/tests/io/test_common.py b/pandas/tests/io/test_common.py index a15eac89ecedb..196a9d3f9f24e 100644 --- a/pandas/tests/io/test_common.py +++ b/pandas/tests/io/test_common.py @@ -1,6 +1,7 @@ """ Tests for the pandas.io.common functionalities """ +import errno from io import StringIO import mmap import os @@ -360,3 +361,65 @@ def test_unknown_engine(self): df.to_csv(path) with pytest.raises(ValueError, match="Unknown 
engine"): pd.read_csv(path, engine="pyt") + + +@pytest.mark.parametrize( + "reader", + [ + pd.read_csv, + pd.read_table, + pd.read_fwf, + pytest.param(pd.read_excel, marks=[td.skip_if_no("xlrd")]), + pytest.param( + pd.read_json, + marks=[ + pytest.mark.xfail( + reason=( + "Needs to distinguish between a path to read " + "vs a string to parse" + ) + ) + ], + ), + pd.read_pickle, + pd.read_stata, + pd.read_sas, + ], +) +def test_errno_set_nonexistent(reader): + # GH#13872 ensure that errno is set on file not found error + # This also serves as a check that we raise the correct type of error + try: + reader("nonexistent_name") + except FileNotFoundError as err: + assert err.errno == errno.ENOENT + + # GH#29125 consistent error messages across read_* functions + assert "File nonexistent_name does not exist: 'nonexistent_name'" in str(err) + + +@pytest.mark.parametrize( + "reader", + [ + pd.read_csv, + pd.read_table, + pd.read_fwf, + pytest.param(pd.read_excel, marks=[td.skip_if_no("xlrd")]), + pd.read_json, + pd.read_pickle, + pd.read_stata, + pd.read_sas, + ], +) +@pytest.mark.skipif(is_platform_windows(), reason="permissions work differently") +def test_errno_set_permissions(reader): + # GH#23784 + # make sure we get permissions error when we try to read without permission + with tm.ensure_clean() as path: + os.chmod(path, 0o000) + try: + reader(path) + except OSError as err: + assert err.errno == errno.EACCES + finally: + os.chmod(path, 0o777)