Skip to content

Commit

Permalink
Add reset metadata origin option (#3731)
Browse files Browse the repository at this point in the history
* ✨ Add reset_metadata parameter to .load

* wip

* wip

* adding option to keep origin

* Mojmir's suggestions

---------

Co-authored-by: Marigold <mojmir.vinkler@gmail.com>
  • Loading branch information
spoonerf and Marigold authored Dec 19, 2024
1 parent df10676 commit 7ae6528
Showing 1 changed file with 21 additions and 2 deletions.
23 changes: 21 additions & 2 deletions lib/catalog/owid/catalog/datasets.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,7 @@
from owid.repack import to_safe_types

from . import tables, utils
from .meta import SOURCE_EXISTS_OPTIONS, DatasetMeta, TableMeta
from .meta import SOURCE_EXISTS_OPTIONS, DatasetMeta, TableMeta, VariableMeta
from .processing_log import disable_processing_log
from .properties import metadata_property

Expand Down Expand Up @@ -155,14 +155,24 @@ def add(
table_filename = join(self.path, table.metadata.checked_name + f".{format}")
table.to(table_filename, repack=repack)

def read(self, name: str, reset_index: bool = True, safe_types: bool = True) -> tables.Table:
def read(
self,
name: str,
reset_index: bool = True,
safe_types: bool = True,
reset_metadata: Literal["keep", "keep_origins", "reset"] = "keep",
) -> tables.Table:
"""Read dataset's table from disk. Alternative to ds[table_name], but
with more options to optimize the reading.
:param reset_index: If true, don't set primary keys of the table. This can make loading
large datasets with multi-indexes much faster.
:param safe_types: If true, convert numeric columns to Float64 and Int64 and categorical
columns to string[pyarrow]. This can significantly increase memory usage.
:param reset_metadata: Controls variable metadata reset behavior.
- "keep": Leave metadata unchanged (default).
- "keep_origins": Reset variable metadata but retain the 'origins' attribute.
- "reset": Reset all variable metadata.
"""
stem = self.path / Path(name)

Expand All @@ -173,6 +183,15 @@ def read(self, name: str, reset_index: bool = True, safe_types: bool = True) ->
t.metadata.dataset = self.metadata
if safe_types:
t = cast(tables.Table, to_safe_types(t))
if reset_metadata in ["keep_origins", "reset"]: # Handles "keep_origins" and "reset"
t.metadata = TableMeta()
for col in t.columns:
if reset_metadata == "keep_origins": # Preserve 'origins' attribute
origins = t[col].metadata.origins if hasattr(t[col].metadata, "origins") else None
t[col].metadata = VariableMeta()
t[col].metadata.origins = origins # Preserve 'origins' attribute
if reset_metadata == "reset": # Reset all metadata
t[col].metadata = VariableMeta()
return t

raise KeyError(f"Table `{name}` not found, available tables: {', '.join(self.table_names)}")
Expand Down

0 comments on commit 7ae6528

Please sign in to comment.