Skip to content
34 changes: 34 additions & 0 deletions pandas/io/common.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@
import bz2
import codecs
import csv
import errno
import gzip
from io import BufferedIOBase, BytesIO
import mmap
Expand Down Expand Up @@ -219,6 +220,9 @@ def get_filepath_or_buffer(
filepath_or_buffer = _stringify_path(filepath_or_buffer)

if isinstance(filepath_or_buffer, str) and _is_url(filepath_or_buffer):
if " " in filepath_or_buffer:
# GH#17918
raise ValueError("URL must be quotes before passing to read_* function")
req = urlopen(filepath_or_buffer)
content_encoding = req.headers.get("Content-Encoding", None)
if content_encoding == "gzip":
Expand All @@ -243,6 +247,7 @@ def get_filepath_or_buffer(
)

if isinstance(filepath_or_buffer, (str, bytes, mmap.mmap)):
validate_local_path(filepath_or_buffer, mode)
return _expand_user(filepath_or_buffer), None, compression, False

if not is_file_like(filepath_or_buffer):
Expand All @@ -252,6 +257,31 @@ def get_filepath_or_buffer(
return filepath_or_buffer, None, compression, False


def validate_local_path(path: str, mode: Optional[str]):
    """
    Ensure we have consistent error messages for non-existent files.

    Parameters
    ----------
    path : str
        Path to a local file.
    mode : str or None
        File mode the caller intends to use; only read modes are checked.

    Raises
    ------
    FileNotFoundError
        If ``mode`` is a read mode and ``path`` does not exist.
        ``errno`` is set to ``ENOENT`` (GH#13872).
    PermissionError
        If ``mode`` is a read mode and ``path`` exists but is not readable.
        ``errno`` is set to ``EACCES`` (GH#23784).
    """
    if mode is None:
        # Without a mode we cannot know what access is required.
        return

    if mode.startswith("r"):
        # We only need read permissions, but the file must exist
        if not os.path.exists(path):
            raise FileNotFoundError(
                errno.ENOENT, f"File {path} does not exist", path
            )
        if not os.access(path, os.R_OK):
            # PermissionError is the OSError subclass mapped to EACCES, so
            # existing callers catching OSError continue to work; pass the
            # filename for consistency with the FileNotFoundError above.
            raise PermissionError(
                errno.EACCES, f"Insufficient permissions to read {path}", path
            )
    # TODO: decide what (if anything) to validate for write/append modes.


def file_path_to_url(path: str) -> str:
"""
converts an absolute native path to a FILE URL.
Expand Down Expand Up @@ -420,10 +450,14 @@ def _get_handle(

# Convert pathlib.Path/py.path.local or string
path_or_buf = _stringify_path(path_or_buf)

is_path = isinstance(path_or_buf, str)

compression, compression_args = _get_compression_method(compression)
if is_path:
if not is_s3_url(path_or_buf) and not is_gcs_url(path_or_buf):
# TODO: better way of checking for local path
validate_local_path(path_or_buf, mode=mode)
compression = _infer_compression(path_or_buf, compression)

if compression:
Expand Down
2 changes: 2 additions & 0 deletions pandas/io/excel/_base.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,7 @@
_validate_header_arg,
get_filepath_or_buffer,
urlopen,
validate_local_path,
)
from pandas.io.excel._util import (
_fill_mi_header,
Expand Down Expand Up @@ -351,6 +352,7 @@ def __init__(self, filepath_or_buffer):
filepath_or_buffer.seek(0)
self.book = self.load_workbook(filepath_or_buffer)
elif isinstance(filepath_or_buffer, str):
validate_local_path(filepath_or_buffer, "r")
self.book = self.load_workbook(filepath_or_buffer)
else:
raise ValueError(
Expand Down
3 changes: 3 additions & 0 deletions pandas/io/json/_json.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,7 @@
_infer_compression,
_stringify_path,
get_filepath_or_buffer,
validate_local_path,
)
from pandas.io.formats.printing import pprint_thing
from pandas.io.parsers import _validate_integer
Expand Down Expand Up @@ -59,6 +60,8 @@ def to_json(
)

path_or_buf = _stringify_path(path_or_buf)
validate_local_path(path_or_buf, "w")

if lines and orient != "records":
raise ValueError("'lines' keyword only valid when 'orient' is records")

Expand Down
3 changes: 3 additions & 0 deletions pandas/io/parsers.py
Original file line number Diff line number Diff line change
Expand Up @@ -69,6 +69,7 @@
_validate_header_arg,
get_filepath_or_buffer,
is_file_like,
validate_local_path,
)
from pandas.io.date_converters import generic_parser

Expand Down Expand Up @@ -1907,6 +1908,8 @@ def __init__(self, src, **kwds):
self.usecols, self.usecols_dtype = _validate_usecols_arg(kwds["usecols"])
kwds["usecols"] = self.usecols

if isinstance(src, str):
validate_local_path(src, "r")
self._reader = parsers.TextReader(src, **kwds)
self.unnamed_cols = self._reader.unnamed_cols

Expand Down
3 changes: 2 additions & 1 deletion pandas/io/sas/sasreader.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
"""
Read SAS sas7bdat or xport files.
"""
from pandas.io.common import _stringify_path
from pandas.io.common import _stringify_path, validate_local_path


def read_sas(
Expand Down Expand Up @@ -55,6 +55,7 @@ def read_sas(
filepath_or_buffer = _stringify_path(filepath_or_buffer)
if not isinstance(filepath_or_buffer, str):
raise ValueError(buffer_error_msg)
validate_local_path(filepath_or_buffer, "r")
fname = filepath_or_buffer.lower()
if fname.endswith(".xpt"):
format = "xport"
Expand Down
4 changes: 3 additions & 1 deletion pandas/io/stata.py
Original file line number Diff line number Diff line change
Expand Up @@ -1053,7 +1053,9 @@ def __init__(
self._native_byteorder = _set_endianness(sys.byteorder)
path_or_buf = _stringify_path(path_or_buf)
if isinstance(path_or_buf, str):
path_or_buf, encoding, _, should_close = get_filepath_or_buffer(path_or_buf)
path_or_buf, encoding, _, should_close = get_filepath_or_buffer(
path_or_buf, mode="r"
)

if isinstance(path_or_buf, (str, bytes)):
self.path_or_buf = open(path_or_buf, "rb")
Expand Down
63 changes: 63 additions & 0 deletions pandas/tests/io/test_common.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
"""
Tests for the pandas.io.common functionalities
"""
import errno
from io import StringIO
import mmap
import os
Expand Down Expand Up @@ -360,3 +361,65 @@ def test_unknown_engine(self):
df.to_csv(path)
with pytest.raises(ValueError, match="Unknown engine"):
pd.read_csv(path, engine="pyt")


@pytest.mark.parametrize(
"reader",
[
pd.read_csv,
pd.read_table,
pd.read_fwf,
pytest.param(pd.read_excel, marks=[td.skip_if_no("xlrd")]),
pytest.param(
pd.read_json,
marks=[
pytest.mark.xfail(
reason=(
"Needs to distinguish between a path to read "
"vs a string to parse"
Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@WillAyd any thoughts on this? I think I saw a semi-recent discussion about trying to guess whether a string is supposed to be a path or actual content.

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yea this was a discussion with @datapythonista and @gfyoung . I don't have a strong point of view in any direction but would be against jumping through a lot of hoops to disambiguate.

I guess if we cared to disambiguate could probably just look for [, {, " or any leading number (ignoring whitespace) to start as those are the requirements for JSON

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I would leave the JSON disambiguation alone for now. I'm still not really that inclined to change at the moment.

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Looking through old issues, there are a lot that revolve around disambiguating paths vs content, e.g. #5924 for pickle. (one clever suggestion was to have pd.read_pickles, pd.read_jsons, etc as analogues to the stdlib json.loads)

This doesn't need to be resolved for this PR, but I think we're going to need to have a decision about whether/when/how to guess if a string is a path or not.

)
)
],
),
pd.read_pickle,
pd.read_stata,
pd.read_sas,
],
)
def test_errno_set_nonexistent(reader):
    # GH#13872 ensure that errno is set on file not found error
    # This also serves as a check that we raise the correct type of error
    #
    # pytest.raises (instead of a bare try/except) guarantees the test
    # fails if no exception is raised at all, rather than passing vacuously.
    with pytest.raises(FileNotFoundError) as exc_info:
        reader("nonexistent_name")
    err = exc_info.value
    assert err.errno == errno.ENOENT

    # GH#29125 consistent error messages across read_* functions
    assert "File nonexistent_name does not exist: 'nonexistent_name'" in str(err)


@pytest.mark.parametrize(
"reader",
[
pd.read_csv,
pd.read_table,
pd.read_fwf,
pytest.param(pd.read_excel, marks=[td.skip_if_no("xlrd")]),
pd.read_json,
pd.read_pickle,
pd.read_stata,
pd.read_sas,
],
)
# skipif (not skip) is required for conditional skipping: pytest.mark.skip
# takes no condition argument, so the original decorator did not actually
# gate on the platform check.
@pytest.mark.skipif(is_platform_windows(), reason="permissions work differently")
def test_errno_set_permissions(reader):
    # GH#23784
    # make sure we get a permissions error when we try to read without
    # permission; err.errno should carry EACCES.
    #
    # NOTE(review): if reader() raises nothing (e.g. running as root, where
    # chmod 0o000 does not block reads) the test passes vacuously — confirm
    # whether an `else: fail` is desired in CI.
    with tm.ensure_clean() as path:
        os.chmod(path, 0o000)
        try:
            reader(path)
        except OSError as err:
            assert err.errno == errno.EACCES
        finally:
            # restore permissions so ensure_clean can remove the temp file
            os.chmod(path, 0o777)