diff --git a/doc/source/whatsnew/v0.21.0.txt b/doc/source/whatsnew/v0.21.0.txt index 18f8858748df5..bf94d647d2db7 100644 --- a/doc/source/whatsnew/v0.21.0.txt +++ b/doc/source/whatsnew/v0.21.0.txt @@ -998,6 +998,7 @@ I/O - Bug in :meth:`DataFrame.to_html` in which there was no validation of the ``justify`` parameter (:issue:`17527`) - Bug in :func:`HDFStore.select` when reading a contiguous mixed-data table featuring VLArray (:issue:`17021`) - Bug in :func:`to_json` where several conditions (including objects with unprintable symbols, objects with deep recursion, overlong labels) caused segfaults instead of raising the appropriate exception (:issue:`14256`) +- Bug in :func:`read_json` where not all UTF-8 characters in a URL were encoded properly when reading JSON data from that URL (:issue:`17918`) Plotting ^^^^^^^^ diff --git a/pandas/io/common.py b/pandas/io/common.py index 534c1e0671150..e60b9176dd55e 100644 --- a/pandas/io/common.py +++ b/pandas/io/common.py @@ -29,7 +29,7 @@ if compat.PY3: - from urllib.request import urlopen, pathname2url + from urllib.request import urlopen, pathname2url, quote _urlopen = urlopen from urllib.parse import urlparse as parse_url from urllib.parse import (uses_relative, uses_netloc, uses_params, @@ -38,7 +38,7 @@ from http.client import HTTPException # noqa else: from urllib2 import urlopen as _urlopen - from urllib import urlencode, pathname2url # noqa + from urllib import urlencode, pathname2url, quote # noqa from urlparse import urlparse as parse_url from urlparse import uses_relative, uses_netloc, uses_params, urljoin from urllib2 import URLError # noqa @@ -187,6 +187,7 @@ def get_filepath_or_buffer(filepath_or_buffer, encoding=None, filepath_or_buffer = _stringify_path(filepath_or_buffer) if _is_url(filepath_or_buffer): + filepath_or_buffer = quote(filepath_or_buffer, safe=';/?:@&=+$,') req = _urlopen(filepath_or_buffer) content_encoding = req.headers.get('Content-Encoding', None) if content_encoding == 'gzip': diff --git 
a/pandas/tests/io/json/test_pandas.py b/pandas/tests/io/json/test_pandas.py index 6625446bea469..38bb04e12e755 100644 --- a/pandas/tests/io/json/test_pandas.py +++ b/pandas/tests/io/json/test_pandas.py @@ -845,12 +845,20 @@ def test_round_trip_exception_(self): index=df.index, columns=df.columns), df) @network - def test_url(self): + def test_url_encoded(self): url = 'https://api.github.com/repos/pandas-dev/pandas/issues?per_page=5' # noqa result = read_json(url, convert_dates=True) for c in ['created_at', 'closed_at', 'updated_at']: assert result[c].dtype == 'datetime64[ns]' + @network + def test_url_unencoded(self): + url = ('https://api.github.com/repos/pandas-dev/pandas/issues?per_pag' + 'e=5&test=fake parameter') + result = read_json(url, convert_dates=True) + for c in ['created_at', 'closed_at', 'updated_at']: + assert result[c].dtype == 'datetime64[ns]' + def test_timedelta(self): converter = lambda x: pd.to_timedelta(x, unit='ms')