Skip to content

Commit

Permalink
Project Catalog optional check_valid, drop_duplicates (#536)
Browse files Browse the repository at this point in the history
<!-- Please ensure the PR fulfills the following requirements! -->
<!-- If this is your first PR, make sure to add your details to the
AUTHORS.rst! -->
### Pull Request Checklist:
- [x] This PR addresses an already opened issue (for bug fixes /
features)
    - This PR fixes #535 
- [x] (If applicable) Documentation has been added / updated (for bug
fixes / features).
- [ ] (If applicable) Tests have been added.
- [ ] This PR does not seem to break the templates.
- [x] CHANGELOG.rst has been updated (with summary of main changes).
- [x] Link to issue (:issue:`number`) and pull request (:pull:`number`)
has been added.

### What kind of change does this PR introduce?

### Does this PR introduce a breaking change?

- No
### Other information:
  • Loading branch information
SarahG-579462 authored Feb 26, 2025
2 parents 7ca21a3 + 490138c commit 7d47d9e
Show file tree
Hide file tree
Showing 2 changed files with 35 additions and 14 deletions.
3 changes: 2 additions & 1 deletion CHANGELOG.rst
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@ Changelog

v0.12.0 (unreleased)
--------------------
Contributors to this version: Trevor James Smith (:user:`Zeitsperre`), Pascal Bourgault (:user:`aulemahal`), Juliette Lavoie (:user:`juliettelavoie`).
Contributors to this version: Trevor James Smith (:user:`Zeitsperre`), Pascal Bourgault (:user:`aulemahal`), Juliette Lavoie (:user:`juliettelavoie`), Sarah Gammon (:user:`SarahG-579462`).

Breaking changes
^^^^^^^^^^^^^^^^
Expand All @@ -14,6 +14,7 @@ Breaking changes
New features and enhancements
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
* Include station-obs and forecasts in the derived schema for `build_path`. (:pull:`534`).
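* Project catalog now accepts optional `check_valid` and `drop_duplicates` keyword arguments (both default to True) to control whether missing files and duplicate entries are removed. (:pull:`536`, :issue:`535`).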

Bug fixes
^^^^^^^^^
Expand Down
46 changes: 33 additions & 13 deletions src/xscen/catalog.py
Original file line number Diff line number Diff line change
Expand Up @@ -719,9 +719,11 @@ def __init__(
create: bool = False,
overwrite: bool = False,
project: dict | None = None,
check_valid: bool = True,
drop_duplicates: bool = True,
**kwargs,
):
"""
r"""
Open or create a project catalog.
Parameters
Expand All @@ -735,6 +737,13 @@ def __init__(
If this and 'create' are True, this will overwrite any existing JSON and CSV file with an empty catalog.
project : dict, optional
Metadata to create the catalog, if required.
check_valid : bool
If True (default), will check that all files in the catalog exist on disk and remove those that don't.
drop_duplicates : bool
If True (default), will drop duplicates in the catalog based on the 'id' and 'path' columns.
\**kwargs : dict
Any other arguments are passed to xscen.catalog.DataCatalog.
Notes
-----
Expand All @@ -746,9 +755,15 @@ def __init__(
if create:
if isinstance(df, str | Path) and (not Path(df).is_file() or overwrite):
self.create(df, project=project, overwrite=overwrite)
super().__init__(df, *args, **kwargs)
self.check_valid()
self.drop_duplicates()
super().__init__(
df,
*args,
check_valid=check_valid,
drop_duplicates=drop_duplicates,
**kwargs,
)
self.check_valid_flag = check_valid
self.drop_duplicates_flag = drop_duplicates
self.meta_file = df if not isinstance(df, dict) else None

# TODO: Implement a way to easily destroy part of the catalog to "reset" some steps
Expand All @@ -772,7 +787,7 @@ def update(
Warnings
--------
If a file was deleted between the parsing of the catalog and this call,
it will be removed from the csv when `check_valid` is called.
it will be removed from the csv if `check_valid` is called.
Parameters
----------
Expand All @@ -786,9 +801,10 @@ def update(
if isinstance(df, pd.Series):
df = pd.DataFrame(df).transpose()
self.esmcat._df = pd.concat([self.df, df])

self.check_valid()
self.drop_duplicates()
if self.check_valid_flag:
self.check_valid()
if self.drop_duplicates_flag:
self.drop_duplicates()

# make sure year really has 4 digits
if "date_start" in self.df:
Expand Down Expand Up @@ -834,8 +850,10 @@ def update(
}
)
disk_cat.esmcat._df = pd.concat([disk_cat.df, df_str])
disk_cat.check_valid()
disk_cat.drop_duplicates()
if self.check_valid_flag:
disk_cat.check_valid()
if self.drop_duplicates_flag:
disk_cat.drop_duplicates()
with fs.open(disk_cat.esmcat.catalog_file, "wb") as csv_outfile:
disk_cat.df.to_csv(csv_outfile, index=False, compression=None)

Expand All @@ -858,7 +876,7 @@ def update_from_ds(
Warnings
--------
If a file was deleted between the parsing of the catalog and this call,
it will be removed from the csv when `check_valid` is called.
it will be removed from the csv if `check_valid` is called.
Parameters
----------
Expand Down Expand Up @@ -910,8 +928,10 @@ def refresh(self):
self.meta_file, read_csv_kwargs=self.read_csv_kwargs
)
initlen = len(self.esmcat.df)
self.check_valid()
self.drop_duplicates()
if self.check_valid_flag:
self.check_valid()
if self.drop_duplicates_flag:
self.drop_duplicates()
if len(self.df) != initlen:
self.update()

Expand Down

0 comments on commit 7d47d9e

Please sign in to comment.