Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add private function _validate_data_input to validate input data #2595

Merged
merged 4 commits into from
Jul 5, 2023
Merged
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
84 changes: 69 additions & 15 deletions pygmt/helpers/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,74 @@
from pygmt.exceptions import GMTInvalidInput


def data_kind(data, x=None, y=None, z=None, required_z=False):
def _validate_data_input(
data=None, x=None, y=None, z=None, required_z=False, kind=None
):
"""
Check if the combination of data/x/y/z is valid.

Examples
--------
>>> _validate_data_input(data="infile")
>>> _validate_data_input(x=[1, 2, 3], y=[4, 5, 6])
>>> _validate_data_input(x=[1, 2, 3], y=[4, 5, 6], z=[7, 8, 9])
>>> _validate_data_input()
Traceback (most recent call last):
...
pygmt.exceptions.GMTInvalidInput: No input data provided.
>>> _validate_data_input(x=[1, 2, 3])
Traceback (most recent call last):
...
pygmt.exceptions.GMTInvalidInput: Must provide both x and y.
>>> _validate_data_input(y=[4, 5, 6])
Traceback (most recent call last):
...
pygmt.exceptions.GMTInvalidInput: Must provide both x and y.
>>> _validate_data_input(x=[1, 2, 3], y=[4, 5, 6], required_z=True)
Traceback (most recent call last):
...
pygmt.exceptions.GMTInvalidInput: Must provide x, y, and z.
>>> _validate_data_input(data="infile", x=[1, 2, 3])
Traceback (most recent call last):
...
pygmt.exceptions.GMTInvalidInput: Too much data. Use either data or x/y/z.
>>> _validate_data_input(data="infile", y=[4, 5, 6])
Traceback (most recent call last):
...
pygmt.exceptions.GMTInvalidInput: Too much data. Use either data or x/y/z.
>>> _validate_data_input(data="infile", z=[7, 8, 9])
Traceback (most recent call last):
...
pygmt.exceptions.GMTInvalidInput: Too much data. Use either data or x/y/z.

Raises
------
GMTInvalidInput
If the data input is not valid.
"""
if data is None: # data is None
if x is None and y is None: # both x and y are None
raise GMTInvalidInput("No input data provided.")
if x is None or y is None: # either x or y is None
raise GMTInvalidInput("Must provide both x and y.")
# both x and y are not None, now check z
if required_z and z is None:
raise GMTInvalidInput("Must provide x, y, and z.")
else: # data is not None
if x is not None or y is not None or z is not None:
raise GMTInvalidInput("Too much data. Use either data or x/y/z.")
Comment on lines +70 to +72
Copy link
Member

@weiji14 weiji14 Jul 5, 2023

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

It makes more sense to also move the following lines to the _validate_data_input function, but it's not easy.

if required_z and (
getattr(data, "shape", (3, 3))[1] < 3 # np.array, pd.DataFrame
or len(getattr(data, "data_vars", (0, 1, 2))) < 3 # xr.Dataset
):
raise GMTInvalidInput("data must provide x, y, and z columns.")

Hmm, can't we just add it here?

Suggested change
else: # data is not None
if x is not None or y is not None or z is not None:
raise GMTInvalidInput("Too much data. Use either data or x/y/z.")
else: # data is not None
if x is not None or y is not None or z is not None:
raise GMTInvalidInput("Too much data. Use either data or x/y/z.")
if required_z and (
getattr(data, "shape", (3, 3))[1] < 3 # np.array, pd.DataFrame
or len(getattr(data, "data_vars", (0, 1, 2))) < 3 # xr.Dataset
):
raise GMTInvalidInput("data must provide x, y, and z columns.")

Edit: Oh wait, I see, we also need to check that data is not str/pathlib.PurePath/xr.DataArray/GeoJSON... But maybe we can do hasattr(data, "shape") or hasattr(data, "data_vars") first?

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

But maybe we can do hasattr(data, "shape") or hasattr(data, "data_vars") first?

Sounds promising.

Copy link
Member

@weiji14 weiji14 Jul 5, 2023

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Haven't tested this, but maybe something like this:

Suggested change
else: # data is not None
if x is not None or y is not None or z is not None:
raise GMTInvalidInput("Too much data. Use either data or x/y/z.")
else: # data is not None
if x is not None or y is not None or z is not None:
raise GMTInvalidInput("Too much data. Use either data or x/y/z.")
if (
hasattr(data, "shape") # np.array, pd.DataFrame
and getattr(data, "shape", (3, 3))[1] < 3
) or (
hasattr(data, "data_vars") # xr.Dataset
and len(getattr(data, "data_vars", (0, 1, 2))) < 3
):
raise GMTInvalidInput("data must provide x, y, and z columns.")

Can't remember why I added a default value with getattr at #1523, might be able to remove it if not needed, but test first.

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Can't remember why I added a default value with getattr at #1523,

getattr(data, "shape", (3, 3)) returns (3,3) if data doesn't has the shape attribute, so it won't raise the exception.

With the above changes, test_geopandas_plot3d_default_cube will fail, because in this test:

>>> multipoint = shapely.geometry.MultiPoint([(0.5, 0.5, 0.5), (1.5, 1.5, 1.5)])
>>> gdf = gpd.GeoDataFrame(geometry=[multipoint])
>>> gdf.shape
(1, 1)

gdf has a shape (1,1).

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Add a not hasattr(data, "__geo_interface__") check perhaps?

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

In ca2d49f, I changed the order of some codes to determine the data kind first and then check if the data input is valid. It works well.

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Nice, that works too. Can you fix the pylint error? Should be good after that.

# For 'matrix' kind, check if data has the required z column
if kind == "matrix" and required_z:
if hasattr(data, "shape"): # np.ndarray or pd.DataFrame
if len(data.shape) == 1 and data.shape[0] < 3:
raise GMTInvalidInput("data must provide x, y, and z columns.")
if len(data.shape) > 1 and data.shape[1] < 3:
raise GMTInvalidInput("data must provide x, y, and z columns.")
if hasattr(data, "data_vars") and len(data.data_vars) < 3: # xr.Dataset
raise GMTInvalidInput("data must provide x, y, and z columns.")


def data_kind(data=None, x=None, y=None, z=None, required_z=False):
"""
Check what kind of data is provided to a module.

Expand Down Expand Up @@ -64,30 +131,17 @@ def data_kind(data, x=None, y=None, z=None, required_z=False):
>>> data_kind(data=xr.DataArray(np.random.rand(4, 3)))
'grid'
"""
if data is None and x is None and y is None:
raise GMTInvalidInput("No input data provided.")
if data is not None and (x is not None or y is not None or z is not None):
raise GMTInvalidInput("Too much data. Use either data or x and y.")
if data is None and (x is None or y is None):
raise GMTInvalidInput("Must provide both x and y.")
if data is None and required_z and z is None:
raise GMTInvalidInput("Must provide x, y, and z.")

if isinstance(data, (str, pathlib.PurePath)):
kind = "file"
elif isinstance(data, xr.DataArray):
kind = "grid"
elif hasattr(data, "__geo_interface__"):
kind = "geojson"
elif data is not None:
if required_z and (
getattr(data, "shape", (3, 3))[1] < 3 # np.array, pd.DataFrame
or len(getattr(data, "data_vars", (0, 1, 2))) < 3 # xr.Dataset
):
raise GMTInvalidInput("data must provide x, y, and z columns.")
kind = "matrix"
else:
kind = "vectors"
_validate_data_input(data=data, x=x, y=y, z=z, required_z=required_z, kind=kind)
return kind


Expand Down