From 8301fa621c1739cc1afd821b2899b27464bf2197 Mon Sep 17 00:00:00 2001 From: Martin Durant Date: Tue, 26 Jun 2018 16:42:56 -0400 Subject: [PATCH 01/35] POC of making a single file out of zarr dot files --- zarr/convenience.py | 21 +++++++++++++++++++++ zarr/tests/test_convenience.py | 32 ++++++++++++++++++++++++++++++-- 2 files changed, 51 insertions(+), 2 deletions(-) diff --git a/zarr/convenience.py b/zarr/convenience.py index 19de7b2826..2c06dac732 100644 --- a/zarr/convenience.py +++ b/zarr/convenience.py @@ -1069,3 +1069,24 @@ def copy_all(source, dest, shallow=False, without_attrs=False, log=None, _log_copy_summary(log, dry_run, n_copied, n_skipped, n_bytes_copied) return n_copied, n_skipped, n_bytes_copied + + +def consolidate_metadata(mapping, out_key='.zmetadata'): + """ + Read all the metadata in the files within the given dataset and join + + Parameters + ---------- + mapping : MutableMapping instance + Containing metadata and data keys of a zarr dataset + out_key : str + Key to place the consolidated data into + """ + import json + + def is_zarr_key(key): + return (key.endswith('.zarray') or key.endswith('.zgroup') or + key.endswith('.zattrs')) + + out = {key: mapping[key].decode() for key in mapping if is_zarr_key(key)} + mapping[out_key] = json.dumps(out) diff --git a/zarr/tests/test_convenience.py b/zarr/tests/test_convenience.py index c77006c4f6..cae105c23e 100644 --- a/zarr/tests/test_convenience.py +++ b/zarr/tests/test_convenience.py @@ -12,8 +12,8 @@ import pytest -from zarr.convenience import open, save, save_group, load, copy_store, copy -from zarr.storage import atexit_rmtree +from zarr.convenience import open, save, save_group, load, copy_store, copy, consolidate_metadata +from zarr.storage import atexit_rmtree, DictStore from zarr.core import Array from zarr.hierarchy import Group, group from zarr.errors import CopyError @@ -91,6 +91,34 @@ def test_lazy_loader(): assert_array_equal(bar, loader['bar']) +def test_consolidate_metadata(): + import json + store = DictStore() + z = group(store) + z.create_group('g1') + g2 = z.create_group('g2') + g2.attrs['hello'] = 'world' + arr = g2.create_dataset('arr', shape=(20, 20), dtype='f8') + arr.attrs['data'] = 1 + arr[:] = 1.0 + consolidate_metadata(store) + assert '.zmetadata' in store + for key in ['.zgroup', + 'g1/.zgroup', + 'g2/.zgroup', + 'g2/.zattrs', + 'g2/arr/.zarray', + 'g2/arr/.zattrs']: + del store[key] + meta = json.loads(store['.zmetadata']) + meta = {k: v.encode() for k, v in meta.items()} + z2 = group(meta, chunk_store=store) + assert list(z2) == ['g1', 'g2'] + assert z2.g2.attrs['hello'] == 'world' + assert z2.g2.arr.attrs['data'] == 1 + assert (z2.g2.arr[:] == 1.0).all() + + class TestCopyStore(unittest.TestCase): def setUp(self): From be6d70650d86b250674d926693e64dd6792b16a6 Mon Sep 17 00:00:00 2001 From: Martin Durant Date: Mon, 2 Jul 2018 14:46:58 -0400 Subject: [PATCH 02/35] (WIP) include simple code that would load metadata Again, this is for example only, not intended final structure --- zarr/convenience.py | 2 +- zarr/hierarchy.py | 8 ++++++++ 2 files changed, 9 insertions(+), 1 deletion(-) diff --git a/zarr/convenience.py b/zarr/convenience.py index 2c06dac732..7aac2e385c 100644 --- a/zarr/convenience.py +++ b/zarr/convenience.py @@ -1089,4 +1089,4 @@ def is_zarr_key(key): key.endswith('.zattrs')) out = {key: mapping[key].decode() for key in mapping if is_zarr_key(key)} - mapping[out_key] = json.dumps(out) + mapping[out_key] = json.dumps(out).encode() diff --git a/zarr/hierarchy.py 
b/zarr/hierarchy.py index e9565caa13..9e401eed69 100644 --- a/zarr/hierarchy.py +++ b/zarr/hierarchy.py @@ -92,6 +92,14 @@ class Group(MutableMapping): def __init__(self, store, path=None, read_only=False, chunk_store=None, cache_attrs=True, synchronizer=None): + try: + import json + metadata = json.loads(store['.zmetadata']) + meta_store = {k: v.encode() for k, v in metadata.items()} + chunk_store, store = store, meta_store + except (KeyError, ValueError, json.JSONDecodeError): + pass + self._store = store self._chunk_store = chunk_store self._path = normalize_storage_path(path) From f1128ff92fd780f6a5d2fcc1351b8f6ae794c609 Mon Sep 17 00:00:00 2001 From: Martin Durant Date: Thu, 2 Aug 2018 12:26:14 -0400 Subject: [PATCH 03/35] Implement ConsolidatedMetadataStore --- zarr/convenience.py | 17 ++++++++- zarr/hierarchy.py | 9 ----- zarr/storage.py | 64 ++++++++++++++++++++++++++++++++++ zarr/tests/test_convenience.py | 11 +++--- 4 files changed, 86 insertions(+), 15 deletions(-) diff --git a/zarr/convenience.py b/zarr/convenience.py index 7aac2e385c..db3fa2f85a 100644 --- a/zarr/convenience.py +++ b/zarr/convenience.py @@ -1073,7 +1073,16 @@ def copy_all(source, dest, shallow=False, without_attrs=False, log=None, def consolidate_metadata(mapping, out_key='.zmetadata'): """ - Read all the metadata in the files within the given dataset and join + Store all the metadata in the files within the given dataset in one key + + This produces a single file in the backend store, containing all the + metadata read from all the zarr-related keys that can be found. This + should be used in conjunction with ``storage.ConsolidatedMetadataStore`` + to reduce the number of operations on the backend store at read time. + + Note, however, that if the dataset is changed after this consolidation, + then the metadata read by ``storage.ConsolidatedMetadataStore`` would + be out of sync with reality unless this function is called again. Parameters ---------- @@ -1081,8 +1090,13 @@ def consolidate_metadata(mapping, out_key='.zmetadata'): Containing metadata and data keys of a zarr dataset out_key : str Key to place the consolidated data into + + Returns + ------- + ConsolidatedMetadataStore instance, based on the same base store. 
""" import json + from .storage import ConsolidatedMetadataStore def is_zarr_key(key): return (key.endswith('.zarray') or key.endswith('.zgroup') or @@ -1090,3 +1104,4 @@ def is_zarr_key(key): out = {key: mapping[key].decode() for key in mapping if is_zarr_key(key)} mapping[out_key] = json.dumps(out).encode() + return ConsolidatedMetadataStore(mapping, out_key) diff --git a/zarr/hierarchy.py b/zarr/hierarchy.py index 9e401eed69..f20b899b2b 100644 --- a/zarr/hierarchy.py +++ b/zarr/hierarchy.py @@ -91,15 +91,6 @@ class Group(MutableMapping): def __init__(self, store, path=None, read_only=False, chunk_store=None, cache_attrs=True, synchronizer=None): - - try: - import json - metadata = json.loads(store['.zmetadata']) - meta_store = {k: v.encode() for k, v in metadata.items()} - chunk_store, store = store, meta_store - except (KeyError, ValueError, json.JSONDecodeError): - pass - self._store = store self._chunk_store = chunk_store self._path = normalize_storage_path(path) diff --git a/zarr/storage.py b/zarr/storage.py index 39a497d08b..a8ed34773f 100644 --- a/zarr/storage.py +++ b/zarr/storage.py @@ -24,6 +24,7 @@ import atexit import re import sys +import json import multiprocessing from threading import Lock, RLock import glob @@ -1883,3 +1884,66 @@ def __delitem__(self, key): with self._mutex: self._invalidate_keys() self._invalidate_value(key) + + +class ConsolidatedMetadataStore(MutableMapping): + """A layer over other storage, with the metadata within a single key + + The purpose of this class, is to be able to get all of the metadata for + a given dataset in a single read operation from the underlying storage. + See ``convenience.consolidate_metadata()`` for how to create this single + metadata key. + + This class loads from the one key, and stores the data in a dict, so that + accessing the keys no longer requires operations on the backend store. + + This class is read-only, and attempts to change the dataset metadata will + fail, but changing the data is possible. If the backend storage is changed + directly, then the metadata stored here could become obsolete, and + ``conslidate_metadata`` should be called again and the class re-invoked. + The use case is for write once, read many times. + + """ + def __init__(self, store, metadata_key='.zmetadata'): + """ + + Parameters + ---------- + store: MutableMapping + Containing the zarr dataset + metadata_key: str + The target in the store where all of the metadata are stores. We + assume JSON encoding. 
+ """ + self.store = store + metadata = json.loads(store[metadata_key]) + self.meta_store = {k: v.encode() for k, v in metadata.items()} + + def __getitem__(self, key): + """Try local dict before falling back to real storage""" + try: + return self.meta_store[key] + except KeyError: + return self.store[key] + + def __iter__(self): + """Only list local keys - data must be got via getitem""" + return iter(self.meta_store) + + def __len__(self): + """Only len of local keys""" + return len(self.meta_store) + + def __delitem__(self, key): + """Data can be deleted from storage""" + if key not in self: + del self.store[key] + else: + raise NotImplementedError + + def __setitem__(self, key, value): + """Data can be written to storage""" + if key not in self: + self.store[key] = value + else: + raise NotImplementedError diff --git a/zarr/tests/test_convenience.py b/zarr/tests/test_convenience.py index cae105c23e..92984f95c1 100644 --- a/zarr/tests/test_convenience.py +++ b/zarr/tests/test_convenience.py @@ -92,7 +92,7 @@ def test_lazy_loader(): def test_consolidate_metadata(): - import json + from zarr.storage import ConsolidatedMetadataStore store = DictStore() z = group(store) z.create_group('g1') @@ -101,7 +101,8 @@ def test_consolidate_metadata(): arr = g2.create_dataset('arr', shape=(20, 20), dtype='f8') arr.attrs['data'] = 1 arr[:] = 1.0 - consolidate_metadata(store) + out = consolidate_metadata(store) + assert isinstance(out, ConsolidatedMetadataStore) assert '.zmetadata' in store for key in ['.zgroup', 'g1/.zgroup', @@ -110,13 +111,13 @@ def test_consolidate_metadata(): 'g2/arr/.zarray', 'g2/arr/.zattrs']: del store[key] - meta = json.loads(store['.zmetadata']) - meta = {k: v.encode() for k, v in meta.items()} - z2 = group(meta, chunk_store=store) + cstore = ConsolidatedMetadataStore(store) + z2 = open(cstore, mode='r') assert list(z2) == ['g1', 'g2'] assert z2.g2.attrs['hello'] == 'world' assert z2.g2.arr.attrs['data'] == 1 assert (z2.g2.arr[:] == 1.0).all() + assert list(out) class TestCopyStore(unittest.TestCase): From 66663912edde64d6ec47b9627b84f7084b0c147e Mon Sep 17 00:00:00 2001 From: Martin Durant Date: Thu, 2 Aug 2018 12:41:07 -0400 Subject: [PATCH 04/35] fix for py34 py35 --- zarr/storage.py | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/zarr/storage.py b/zarr/storage.py index a8ed34773f..8494427ea7 100644 --- a/zarr/storage.py +++ b/zarr/storage.py @@ -1912,11 +1912,15 @@ def __init__(self, store, metadata_key='.zmetadata'): store: MutableMapping Containing the zarr dataset metadata_key: str - The target in the store where all of the metadata are stores. We + The target in the store where all of the metadata are stored. We assume JSON encoding. 
""" self.store = store - metadata = json.loads(store[metadata_key]) + if sys.version_info.major == 3 and sys.version_info.minor < 6: + d = store[metadata_key].decode() + else: + d = store[metadata_key] + metadata = json.loads(d) self.meta_store = {k: v.encode() for k, v in metadata.items()} def __getitem__(self, key): From a369073d57436683aba4d24200c35ca67bb21f85 Mon Sep 17 00:00:00 2001 From: Martin Durant Date: Thu, 2 Aug 2018 13:34:24 -0400 Subject: [PATCH 05/35] improve coverage; data write in consolidated store --- zarr/storage.py | 4 ++-- zarr/tests/test_convenience.py | 10 ++++++++-- 2 files changed, 10 insertions(+), 4 deletions(-) diff --git a/zarr/storage.py b/zarr/storage.py index 8494427ea7..ce3b0c8b4d 100644 --- a/zarr/storage.py +++ b/zarr/storage.py @@ -1940,14 +1940,14 @@ def __len__(self): def __delitem__(self, key): """Data can be deleted from storage""" - if key not in self: + if key not in self.meta_store: del self.store[key] else: raise NotImplementedError def __setitem__(self, key, value): """Data can be written to storage""" - if key not in self: + if key not in self.meta_store: self.store[key] = value else: raise NotImplementedError diff --git a/zarr/tests/test_convenience.py b/zarr/tests/test_convenience.py index 92984f95c1..379a039e01 100644 --- a/zarr/tests/test_convenience.py +++ b/zarr/tests/test_convenience.py @@ -112,12 +112,18 @@ def test_consolidate_metadata(): 'g2/arr/.zattrs']: del store[key] cstore = ConsolidatedMetadataStore(store) - z2 = open(cstore, mode='r') + z2 = open(cstore) assert list(z2) == ['g1', 'g2'] assert z2.g2.attrs['hello'] == 'world' assert z2.g2.arr.attrs['data'] == 1 assert (z2.g2.arr[:] == 1.0).all() - assert list(out) + assert list(out) == list(cstore) + + # tests del/write on the store + del cstore['g2/arr/0.0'] + assert (z2.g2.arr[:] == 0).all() + z2.g2.arr[:] = 2 + assert (z2.g2.arr[:] == 2).all() class TestCopyStore(unittest.TestCase): From 96e1fb0b963b0f6a3164c6977a1a2dc30f1792b9 Mon Sep 17 00:00:00 2001 From: Martin Durant Date: Thu, 2 Aug 2018 15:37:59 -0400 Subject: [PATCH 06/35] coverage --- zarr/storage.py | 4 ++-- zarr/tests/test_convenience.py | 4 ++++ 2 files changed, 6 insertions(+), 2 deletions(-) diff --git a/zarr/storage.py b/zarr/storage.py index ce3b0c8b4d..ece3c11677 100644 --- a/zarr/storage.py +++ b/zarr/storage.py @@ -1917,8 +1917,8 @@ def __init__(self, store, metadata_key='.zmetadata'): """ self.store = store if sys.version_info.major == 3 and sys.version_info.minor < 6: - d = store[metadata_key].decode() - else: + d = store[metadata_key].decode() # pragma: no cover + else: # pragma: no cover d = store[metadata_key] metadata = json.loads(d) self.meta_store = {k: v.encode() for k, v in metadata.items()} diff --git a/zarr/tests/test_convenience.py b/zarr/tests/test_convenience.py index 379a039e01..62da7d4b77 100644 --- a/zarr/tests/test_convenience.py +++ b/zarr/tests/test_convenience.py @@ -120,6 +120,10 @@ def test_consolidate_metadata(): assert list(out) == list(cstore) # tests del/write on the store + with pytest.raises(NotImplementedError): + del cstore['.zgroup'] + with pytest.raises(NotImplementedError): + cstore['.zgroup'] = None del cstore['g2/arr/0.0'] assert (z2.g2.arr[:] == 0).all() z2.g2.arr[:] = 2 From 7a5c81d368c5e2584aa0eac586b0912645abcc81 Mon Sep 17 00:00:00 2001 From: Martin Durant Date: Tue, 26 Jun 2018 16:42:56 -0400 Subject: [PATCH 07/35] POC of making a single file out of zarr dot files --- zarr/convenience.py | 21 +++++++++++++++++++++ zarr/tests/test_convenience.py | 32 
++++++++++++++++++++++++++++++-- 2 files changed, 51 insertions(+), 2 deletions(-) diff --git a/zarr/convenience.py b/zarr/convenience.py index 19de7b2826..2c06dac732 100644 --- a/zarr/convenience.py +++ b/zarr/convenience.py @@ -1069,3 +1069,24 @@ def copy_all(source, dest, shallow=False, without_attrs=False, log=None, _log_copy_summary(log, dry_run, n_copied, n_skipped, n_bytes_copied) return n_copied, n_skipped, n_bytes_copied + + +def consolidate_metadata(mapping, out_key='.zmetadata'): + """ + Read all the metadata in the files within the given dataset and join + + Parameters + ---------- + mapping : MutableMapping instance + Containing metadata and data keys of a zarr dataset + out_key : str + Key to place the consolidated data into + """ + import json + + def is_zarr_key(key): + return (key.endswith('.zarray') or key.endswith('.zgroup') or + key.endswith('.zattrs')) + + out = {key: mapping[key].decode() for key in mapping if is_zarr_key(key)} + mapping[out_key] = json.dumps(out) diff --git a/zarr/tests/test_convenience.py b/zarr/tests/test_convenience.py index c77006c4f6..cae105c23e 100644 --- a/zarr/tests/test_convenience.py +++ b/zarr/tests/test_convenience.py @@ -12,8 +12,8 @@ import pytest -from zarr.convenience import open, save, save_group, load, copy_store, copy -from zarr.storage import atexit_rmtree +from zarr.convenience import open, save, save_group, load, copy_store, copy, consolidate_metadata +from zarr.storage import atexit_rmtree, DictStore from zarr.core import Array from zarr.hierarchy import Group, group from zarr.errors import CopyError @@ -91,6 +91,34 @@ def test_lazy_loader(): assert_array_equal(bar, loader['bar']) +def test_consolidate_metadata(): + import json + store = DictStore() + z = group(store) + z.create_group('g1') + g2 = z.create_group('g2') + g2.attrs['hello'] = 'world' + arr = g2.create_dataset('arr', shape=(20, 20), dtype='f8') + arr.attrs['data'] = 1 + arr[:] = 1.0 + consolidate_metadata(store) + assert '.zmetadata' in store + for key in ['.zgroup', + 'g1/.zgroup', + 'g2/.zgroup', + 'g2/.zattrs', + 'g2/arr/.zarray', + 'g2/arr/.zattrs']: + del store[key] + meta = json.loads(store['.zmetadata']) + meta = {k: v.encode() for k, v in meta.items()} + z2 = group(meta, chunk_store=store) + assert list(z2) == ['g1', 'g2'] + assert z2.g2.attrs['hello'] == 'world' + assert z2.g2.arr.attrs['data'] == 1 + assert (z2.g2.arr[:] == 1.0).all() + + class TestCopyStore(unittest.TestCase): def setUp(self): From 0711920696d3b0e717c7a851dfe7308d6955c146 Mon Sep 17 00:00:00 2001 From: Martin Durant Date: Mon, 2 Jul 2018 14:46:58 -0400 Subject: [PATCH 08/35] (WIP) include simple code that would load metadata Again, this is for example only, not intended final structure --- zarr/convenience.py | 2 +- zarr/hierarchy.py | 8 ++++++++ 2 files changed, 9 insertions(+), 1 deletion(-) diff --git a/zarr/convenience.py b/zarr/convenience.py index 2c06dac732..7aac2e385c 100644 --- a/zarr/convenience.py +++ b/zarr/convenience.py @@ -1089,4 +1089,4 @@ def is_zarr_key(key): key.endswith('.zattrs')) out = {key: mapping[key].decode() for key in mapping if is_zarr_key(key)} - mapping[out_key] = json.dumps(out) + mapping[out_key] = json.dumps(out).encode() diff --git a/zarr/hierarchy.py b/zarr/hierarchy.py index e9565caa13..9e401eed69 100644 --- a/zarr/hierarchy.py +++ b/zarr/hierarchy.py @@ -92,6 +92,14 @@ class Group(MutableMapping): def __init__(self, store, path=None, read_only=False, chunk_store=None, cache_attrs=True, synchronizer=None): + try: + import json + metadata = 
json.loads(store['.zmetadata']) + meta_store = {k: v.encode() for k, v in metadata.items()} + chunk_store, store = store, meta_store + except (KeyError, ValueError, json.JSONDecodeError): + pass + self._store = store self._chunk_store = chunk_store self._path = normalize_storage_path(path) From f8e6a2f62fdcd54604d9fde532977572418b0884 Mon Sep 17 00:00:00 2001 From: Martin Durant Date: Thu, 2 Aug 2018 12:26:14 -0400 Subject: [PATCH 09/35] Implement ConsolidatedMetadataStore --- zarr/convenience.py | 17 ++++++++- zarr/hierarchy.py | 9 ----- zarr/storage.py | 64 ++++++++++++++++++++++++++++++++++ zarr/tests/test_convenience.py | 11 +++--- 4 files changed, 86 insertions(+), 15 deletions(-) diff --git a/zarr/convenience.py b/zarr/convenience.py index 7aac2e385c..db3fa2f85a 100644 --- a/zarr/convenience.py +++ b/zarr/convenience.py @@ -1073,7 +1073,16 @@ def copy_all(source, dest, shallow=False, without_attrs=False, log=None, def consolidate_metadata(mapping, out_key='.zmetadata'): """ - Read all the metadata in the files within the given dataset and join + Store all the metadata in the files within the given dataset in one key + + This produces a single file in the backend store, containing all the + metadata read from all the zarr-related keys that can be found. This + should be used in conjunction with ``storage.ConsolidatedMetadataStore`` + to reduce the number of operations on the backend store at read time. + + Note, however, that if the dataset is changed after this consolidation, + then the metadata read by ``storage.ConsolidatedMetadataStore`` would + be out of sync with reality unless this function is called again. Parameters ---------- @@ -1081,8 +1090,13 @@ def consolidate_metadata(mapping, out_key='.zmetadata'): Containing metadata and data keys of a zarr dataset out_key : str Key to place the consolidated data into + + Returns + ------- + ConsolidatedMetadataStore instance, based on the same base store. 
""" import json + from .storage import ConsolidatedMetadataStore def is_zarr_key(key): return (key.endswith('.zarray') or key.endswith('.zgroup') or @@ -1090,3 +1104,4 @@ def is_zarr_key(key): out = {key: mapping[key].decode() for key in mapping if is_zarr_key(key)} mapping[out_key] = json.dumps(out).encode() + return ConsolidatedMetadataStore(mapping, out_key) diff --git a/zarr/hierarchy.py b/zarr/hierarchy.py index 9e401eed69..f20b899b2b 100644 --- a/zarr/hierarchy.py +++ b/zarr/hierarchy.py @@ -91,15 +91,6 @@ class Group(MutableMapping): def __init__(self, store, path=None, read_only=False, chunk_store=None, cache_attrs=True, synchronizer=None): - - try: - import json - metadata = json.loads(store['.zmetadata']) - meta_store = {k: v.encode() for k, v in metadata.items()} - chunk_store, store = store, meta_store - except (KeyError, ValueError, json.JSONDecodeError): - pass - self._store = store self._chunk_store = chunk_store self._path = normalize_storage_path(path) diff --git a/zarr/storage.py b/zarr/storage.py index 8b551d1254..440996d0c6 100644 --- a/zarr/storage.py +++ b/zarr/storage.py @@ -24,6 +24,7 @@ import atexit import re import sys +import json import multiprocessing from threading import Lock, RLock import glob @@ -1887,3 +1888,66 @@ def __delitem__(self, key): with self._mutex: self._invalidate_keys() self._invalidate_value(key) + + +class ConsolidatedMetadataStore(MutableMapping): + """A layer over other storage, with the metadata within a single key + + The purpose of this class, is to be able to get all of the metadata for + a given dataset in a single read operation from the underlying storage. + See ``convenience.consolidate_metadata()`` for how to create this single + metadata key. + + This class loads from the one key, and stores the data in a dict, so that + accessing the keys no longer requires operations on the backend store. + + This class is read-only, and attempts to change the dataset metadata will + fail, but changing the data is possible. If the backend storage is changed + directly, then the metadata stored here could become obsolete, and + ``conslidate_metadata`` should be called again and the class re-invoked. + The use case is for write once, read many times. + + """ + def __init__(self, store, metadata_key='.zmetadata'): + """ + + Parameters + ---------- + store: MutableMapping + Containing the zarr dataset + metadata_key: str + The target in the store where all of the metadata are stores. We + assume JSON encoding. 
+ """ + self.store = store + metadata = json.loads(store[metadata_key]) + self.meta_store = {k: v.encode() for k, v in metadata.items()} + + def __getitem__(self, key): + """Try local dict before falling back to real storage""" + try: + return self.meta_store[key] + except KeyError: + return self.store[key] + + def __iter__(self): + """Only list local keys - data must be got via getitem""" + return iter(self.meta_store) + + def __len__(self): + """Only len of local keys""" + return len(self.meta_store) + + def __delitem__(self, key): + """Data can be deleted from storage""" + if key not in self: + del self.store[key] + else: + raise NotImplementedError + + def __setitem__(self, key, value): + """Data can be written to storage""" + if key not in self: + self.store[key] = value + else: + raise NotImplementedError diff --git a/zarr/tests/test_convenience.py b/zarr/tests/test_convenience.py index cae105c23e..92984f95c1 100644 --- a/zarr/tests/test_convenience.py +++ b/zarr/tests/test_convenience.py @@ -92,7 +92,7 @@ def test_lazy_loader(): def test_consolidate_metadata(): - import json + from zarr.storage import ConsolidatedMetadataStore store = DictStore() z = group(store) z.create_group('g1') @@ -101,7 +101,8 @@ def test_consolidate_metadata(): arr = g2.create_dataset('arr', shape=(20, 20), dtype='f8') arr.attrs['data'] = 1 arr[:] = 1.0 - consolidate_metadata(store) + out = consolidate_metadata(store) + assert isinstance(out, ConsolidatedMetadataStore) assert '.zmetadata' in store for key in ['.zgroup', 'g1/.zgroup', @@ -110,13 +111,13 @@ def test_consolidate_metadata(): 'g2/arr/.zarray', 'g2/arr/.zattrs']: del store[key] - meta = json.loads(store['.zmetadata']) - meta = {k: v.encode() for k, v in meta.items()} - z2 = group(meta, chunk_store=store) + cstore = ConsolidatedMetadataStore(store) + z2 = open(cstore, mode='r') assert list(z2) == ['g1', 'g2'] assert z2.g2.attrs['hello'] == 'world' assert z2.g2.arr.attrs['data'] == 1 assert (z2.g2.arr[:] == 1.0).all() + assert list(out) class TestCopyStore(unittest.TestCase): From c4436c749c36e2bdd039518793d24b94b69ef8ee Mon Sep 17 00:00:00 2001 From: Martin Durant Date: Thu, 2 Aug 2018 12:41:07 -0400 Subject: [PATCH 10/35] fix for py34 py35 --- zarr/storage.py | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/zarr/storage.py b/zarr/storage.py index 440996d0c6..5c9aa0d76e 100644 --- a/zarr/storage.py +++ b/zarr/storage.py @@ -1916,11 +1916,15 @@ def __init__(self, store, metadata_key='.zmetadata'): store: MutableMapping Containing the zarr dataset metadata_key: str - The target in the store where all of the metadata are stores. We + The target in the store where all of the metadata are stored. We assume JSON encoding. 
""" self.store = store - metadata = json.loads(store[metadata_key]) + if sys.version_info.major == 3 and sys.version_info.minor < 6: + d = store[metadata_key].decode() + else: + d = store[metadata_key] + metadata = json.loads(d) self.meta_store = {k: v.encode() for k, v in metadata.items()} def __getitem__(self, key): From 0757a72daccac7924c20bf4ee539b027cefa8d9e Mon Sep 17 00:00:00 2001 From: Martin Durant Date: Thu, 2 Aug 2018 13:34:24 -0400 Subject: [PATCH 11/35] improve coverage; data write in consolidated store --- zarr/storage.py | 4 ++-- zarr/tests/test_convenience.py | 10 ++++++++-- 2 files changed, 10 insertions(+), 4 deletions(-) diff --git a/zarr/storage.py b/zarr/storage.py index 5c9aa0d76e..3e211c608a 100644 --- a/zarr/storage.py +++ b/zarr/storage.py @@ -1944,14 +1944,14 @@ def __len__(self): def __delitem__(self, key): """Data can be deleted from storage""" - if key not in self: + if key not in self.meta_store: del self.store[key] else: raise NotImplementedError def __setitem__(self, key, value): """Data can be written to storage""" - if key not in self: + if key not in self.meta_store: self.store[key] = value else: raise NotImplementedError diff --git a/zarr/tests/test_convenience.py b/zarr/tests/test_convenience.py index 92984f95c1..379a039e01 100644 --- a/zarr/tests/test_convenience.py +++ b/zarr/tests/test_convenience.py @@ -112,12 +112,18 @@ def test_consolidate_metadata(): 'g2/arr/.zattrs']: del store[key] cstore = ConsolidatedMetadataStore(store) - z2 = open(cstore, mode='r') + z2 = open(cstore) assert list(z2) == ['g1', 'g2'] assert z2.g2.attrs['hello'] == 'world' assert z2.g2.arr.attrs['data'] == 1 assert (z2.g2.arr[:] == 1.0).all() - assert list(out) + assert list(out) == list(cstore) + + # tests del/write on the store + del cstore['g2/arr/0.0'] + assert (z2.g2.arr[:] == 0).all() + z2.g2.arr[:] = 2 + assert (z2.g2.arr[:] == 2).all() class TestCopyStore(unittest.TestCase): From da3f6d7509ebb1a82f7e2fbc60c7ef7e53df5fac Mon Sep 17 00:00:00 2001 From: Martin Durant Date: Thu, 2 Aug 2018 15:37:59 -0400 Subject: [PATCH 12/35] coverage --- zarr/storage.py | 4 ++-- zarr/tests/test_convenience.py | 4 ++++ 2 files changed, 6 insertions(+), 2 deletions(-) diff --git a/zarr/storage.py b/zarr/storage.py index 3e211c608a..e3f2506f22 100644 --- a/zarr/storage.py +++ b/zarr/storage.py @@ -1921,8 +1921,8 @@ def __init__(self, store, metadata_key='.zmetadata'): """ self.store = store if sys.version_info.major == 3 and sys.version_info.minor < 6: - d = store[metadata_key].decode() - else: + d = store[metadata_key].decode() # pragma: no cover + else: # pragma: no cover d = store[metadata_key] metadata = json.loads(d) self.meta_store = {k: v.encode() for k, v in metadata.items()} diff --git a/zarr/tests/test_convenience.py b/zarr/tests/test_convenience.py index 379a039e01..62da7d4b77 100644 --- a/zarr/tests/test_convenience.py +++ b/zarr/tests/test_convenience.py @@ -120,6 +120,10 @@ def test_consolidate_metadata(): assert list(out) == list(cstore) # tests del/write on the store + with pytest.raises(NotImplementedError): + del cstore['.zgroup'] + with pytest.raises(NotImplementedError): + cstore['.zgroup'] = None del cstore['g2/arr/0.0'] assert (z2.g2.arr[:] == 0).all() z2.g2.arr[:] = 2 From 552a0841d2f8a4e83f3ffcaf31d383c55b1f4636 Mon Sep 17 00:00:00 2001 From: Martin Durant Date: Tue, 26 Jun 2018 16:42:56 -0400 Subject: [PATCH 13/35] POC of making a single file out of zarr dot files --- zarr/convenience.py | 21 +++++++++++++++++++++ zarr/tests/test_convenience.py | 32 
From 03d1dbcae3cdf993bef59c7e110774ba60be36c1 Mon Sep 17 00:00:00 2001 From: Alistair Miles Date: Thu, 18 Oct 2018 23:06:57 +0100 Subject: [PATCH 19/35] doc and param style --- zarr/convenience.py | 30 +++++++++++++++++------------- zarr/storage.py | 22 ++++++++++------------ 2 files changed, 27
insertions(+), 25 deletions(-) diff --git a/zarr/convenience.py b/zarr/convenience.py index db3fa2f85a..f651f67260 100644 --- a/zarr/convenience.py +++ b/zarr/convenience.py @@ -1071,37 +1071,41 @@ def copy_all(source, dest, shallow=False, without_attrs=False, log=None, return n_copied, n_skipped, n_bytes_copied -def consolidate_metadata(mapping, out_key='.zmetadata'): +def consolidate_metadata(store, metadata_key='.zmetadata'): """ - Store all the metadata in the files within the given dataset in one key + Consolidate all metadata for groups and arrays within the given store + into a single resource and put it under the given key. - This produces a single file in the backend store, containing all the + This produces a single object in the backend store, containing all the metadata read from all the zarr-related keys that can be found. This should be used in conjunction with ``storage.ConsolidatedMetadataStore`` to reduce the number of operations on the backend store at read time. - Note, however, that if the dataset is changed after this consolidation, - then the metadata read by ``storage.ConsolidatedMetadataStore`` would - be out of sync with reality unless this function is called again. + Note, however, that if any metadata in the store is changed after this + consolidation, then the metadata read by ``storage.ConsolidatedMetadataStore`` + would be out of sync with reality unless this function is called again. Parameters ---------- - mapping : MutableMapping instance - Containing metadata and data keys of a zarr dataset - out_key : str - Key to place the consolidated data into + store : MutableMapping or string + Store or path to directory in file system or name of zip file. + metadata_key : str + Key to put the consolidated metadata under. Returns ------- ConsolidatedMetadataStore instance, based on the same base store. + """ import json from .storage import ConsolidatedMetadataStore + store = normalize_store_arg(store) + def is_zarr_key(key): return (key.endswith('.zarray') or key.endswith('.zgroup') or key.endswith('.zattrs')) - out = {key: mapping[key].decode() for key in mapping if is_zarr_key(key)} - mapping[out_key] = json.dumps(out).encode() - return ConsolidatedMetadataStore(mapping, out_key) + out = {key: store[key].decode() for key in store if is_zarr_key(key)} + store[metadata_key] = json.dumps(out).encode() + return ConsolidatedMetadataStore(store, metadata_key) diff --git a/zarr/storage.py b/zarr/storage.py index 290bb8f0c9..91ebd8e382 100644 --- a/zarr/storage.py +++ b/zarr/storage.py @@ -1895,7 +1895,7 @@ def __delitem__(self, key): class ConsolidatedMetadataStore(MutableMapping): - """A layer over other storage, with the metadata within a single key + """A layer over other storage, with the metadata within a single key. The purpose of this class, is to be able to get all of the metadata for a given dataset in a single read operation from the underlying storage. @@ -1908,21 +1908,19 @@ class ConsolidatedMetadataStore(MutableMapping): This class is read-only, and attempts to change the dataset metadata will fail, but changing the data is possible. If the backend storage is changed directly, then the metadata stored here could become obsolete, and - ``conslidate_metadata`` should be called again and the class re-invoked. + ``consolidate_metadata`` should be called again and the class re-invoked. The use case is for write once, read many times. + Parameters + ---------- + store: MutableMapping + Containing the zarr dataset. 
+ metadata_key: str + The target in the store where all of the metadata are stored. We + assume JSON encoding. + """ def __init__(self, store, metadata_key='.zmetadata'): - """ - - Parameters - ---------- - store: MutableMapping - Containing the zarr dataset - metadata_key: str - The target in the store where all of the metadata are stored. We - assume JSON encoding. - """ self.store = store if sys.version_info.major == 3 and sys.version_info.minor < 6: d = store[metadata_key].decode() # pragma: no cover From 4e555488e01eff63786631bb63f8c90583c8826f Mon Sep 17 00:00:00 2001 From: Alistair Miles Date: Thu, 18 Oct 2018 23:27:44 +0100 Subject: [PATCH 20/35] add test for nchunks_initialized --- zarr/tests/test_convenience.py | 13 +++++++++---- 1 file changed, 9 insertions(+), 4 deletions(-) diff --git a/zarr/tests/test_convenience.py b/zarr/tests/test_convenience.py index 62da7d4b77..7eb9626405 100644 --- a/zarr/tests/test_convenience.py +++ b/zarr/tests/test_convenience.py @@ -98,9 +98,12 @@ def test_consolidate_metadata(): z.create_group('g1') g2 = z.create_group('g2') g2.attrs['hello'] = 'world' - arr = g2.create_dataset('arr', shape=(20, 20), dtype='f8') + arr = g2.create_dataset('arr', shape=(20, 20), chunks=(5, 5), dtype='f8') + assert 16 == arr.nchunks + assert 0 == arr.nchunks_initialized arr.attrs['data'] = 1 arr[:] = 1.0 + assert 16 == arr.nchunks_initialized out = consolidate_metadata(store) assert isinstance(out, ConsolidatedMetadataStore) assert '.zmetadata' in store @@ -113,10 +116,12 @@ def test_consolidate_metadata(): del store[key] cstore = ConsolidatedMetadataStore(store) z2 = open(cstore) - assert list(z2) == ['g1', 'g2'] - assert z2.g2.attrs['hello'] == 'world' - assert z2.g2.arr.attrs['data'] == 1 + assert ['g1', 'g2'] == list(z2) + assert 'world' == z2.g2.attrs['hello'] + assert 1 == z2.g2.arr.attrs['data'] assert (z2.g2.arr[:] == 1.0).all() + assert 16 == z2.g2.arr.nchunks + assert 16 == z2.g2.arr.nchunks_initialized assert list(out) == list(cstore) # tests del/write on the store From c283487acdbdde06a71745dce1793438ee2c56af Mon Sep 17 00:00:00 2001 From: Alistair Miles Date: Fri, 19 Oct 2018 00:10:36 +0100 Subject: [PATCH 21/35] expose chunk_store param in open* functions --- zarr/creation.py | 18 ++++++++++++------ zarr/hierarchy.py | 15 ++++++++++----- 2 files changed, 22 insertions(+), 11 deletions(-) diff --git a/zarr/creation.py b/zarr/creation.py index 49b4a9d2ea..35cb0cf8c0 100644 --- a/zarr/creation.py +++ b/zarr/creation.py @@ -349,7 +349,7 @@ def array(data, **kwargs): def open_array(store, mode='a', shape=None, chunks=True, dtype=None, compressor='default', fill_value=0, order='C', synchronizer=None, filters=None, cache_metadata=True, cache_attrs=True, path=None, object_codec=None, - **kwargs): + chunk_store=None, **kwargs): """Open an array using file-mode-like semantics. Parameters @@ -391,6 +391,8 @@ def open_array(store, mode='a', shape=None, chunks=True, dtype=None, compressor= Array path within store. object_codec : Codec, optional A codec to encode object arrays, only needed if dtype=object. + chunk_store : MutableMapping or string, optional + Store or path to directory in file system or name of zip file. 
Returns ------- @@ -426,7 +428,10 @@ def open_array(store, mode='a', shape=None, chunks=True, dtype=None, compressor= # a : read/write if exists, create otherwise (default) # handle polymorphic store arg - store = normalize_store_arg(store, clobber=(mode == 'w')) + clobber = mode == 'w' + store = normalize_store_arg(store, clobber=clobber) + if chunk_store is not None: + chunk_store = normalize_store_arg(chunk_store, clobber=clobber) path = normalize_storage_path(path) # API compatibility with h5py @@ -448,7 +453,7 @@ def open_array(store, mode='a', shape=None, chunks=True, dtype=None, compressor= init_array(store, shape=shape, chunks=chunks, dtype=dtype, compressor=compressor, fill_value=fill_value, order=order, filters=filters, overwrite=True, path=path, - object_codec=object_codec) + object_codec=object_codec, chunk_store=chunk_store) elif mode == 'a': if contains_group(store, path=path): @@ -457,7 +462,7 @@ def open_array(store, mode='a', shape=None, chunks=True, dtype=None, compressor= init_array(store, shape=shape, chunks=chunks, dtype=dtype, compressor=compressor, fill_value=fill_value, order=order, filters=filters, path=path, - object_codec=object_codec) + object_codec=object_codec, chunk_store=chunk_store) elif mode in ['w-', 'x']: if contains_group(store, path=path): @@ -468,14 +473,15 @@ def open_array(store, mode='a', shape=None, chunks=True, dtype=None, compressor= init_array(store, shape=shape, chunks=chunks, dtype=dtype, compressor=compressor, fill_value=fill_value, order=order, filters=filters, path=path, - object_codec=object_codec) + object_codec=object_codec, chunk_store=chunk_store) # determine read only status read_only = mode == 'r' # instantiate array z = Array(store, read_only=read_only, synchronizer=synchronizer, - cache_metadata=cache_metadata, cache_attrs=cache_attrs, path=path) + cache_metadata=cache_metadata, cache_attrs=cache_attrs, path=path, + chunk_store=chunk_store) return z diff --git a/zarr/hierarchy.py b/zarr/hierarchy.py index f20b899b2b..b7359dafa7 100644 --- a/zarr/hierarchy.py +++ b/zarr/hierarchy.py @@ -1058,7 +1058,8 @@ def group(store=None, overwrite=False, chunk_store=None, cache_attrs=cache_attrs, synchronizer=synchronizer, path=path) -def open_group(store, mode='a', cache_attrs=True, synchronizer=None, path=None): +def open_group(store, mode='a', cache_attrs=True, synchronizer=None, path=None, + chunk_store=None): """Open a group using file-mode-like semantics. Parameters @@ -1078,6 +1079,8 @@ def open_group(store, mode='a', cache_attrs=True, synchronizer=None, path=None): Array synchronizer. path : string, optional Group path within store. + chunk_store : MutableMapping or string, optional + Store or path to directory in file system or name of zip file. 
Returns ------- g : zarr.hierarchy.Group @@ -1101,6 +1104,8 @@ def open_group(store, mode='a', cache_attrs=True, synchronizer=None, path=None): # handle polymorphic store arg store = _normalize_store_arg(store) + if chunk_store is not None: + chunk_store = _normalize_store_arg(chunk_store) path = normalize_storage_path(path) # ensure store is initialized @@ -1112,13 +1117,13 @@ def open_group(store, mode='a', cache_attrs=True, synchronizer=None, path=None): err_group_not_found(path) elif mode == 'w': - init_group(store, overwrite=True, path=path) + init_group(store, overwrite=True, path=path, chunk_store=chunk_store) elif mode == 'a': if contains_array(store, path=path): err_contains_array(path) if not contains_group(store, path=path): - init_group(store, path=path) + init_group(store, path=path, chunk_store=chunk_store) elif mode in ['w-', 'x']: if contains_array(store, path=path): err_contains_array(path) elif contains_group(store, path=path): err_contains_group(path) else: - init_group(store, path=path) + init_group(store, path=path, chunk_store=chunk_store) # determine read only status read_only = mode == 'r' return Group(store, read_only=read_only, cache_attrs=cache_attrs, - synchronizer=synchronizer, path=path) + synchronizer=synchronizer, path=path, chunk_store=chunk_store) From cc9d7c774ab7c2c8aab8cd00591b47d46743a08d Mon Sep 17 00:00:00 2001 From: Alistair Miles Date: Fri, 19 Oct 2018 00:11:36 +0100 Subject: [PATCH 22/35] implement open_consolidated --- zarr/convenience.py | 23 ++++++++++++++++++-- zarr/storage.py | 29 +++++++++++-------------- zarr/tests/test_convenience.py | 39 +++++++++++++++++++++++++---------- 3 files changed, 62 insertions(+), 29 deletions(-) diff --git a/zarr/convenience.py b/zarr/convenience.py index f651f67260..9bc8be4438 100644 --- a/zarr/convenience.py +++ b/zarr/convenience.py @@ -68,7 +68,8 @@ def open(store, mode='a', **kwargs): path = kwargs.get('path', None) # handle polymorphic store arg - store = normalize_store_arg(store, clobber=(mode == 'w')) + clobber = mode == 'w' + store = normalize_store_arg(store, clobber=clobber) path = normalize_storage_path(path) if mode in {'w', 'w-', 'x'}: @@ -1108,4 +1109,22 @@ def is_zarr_key(key): out = {key: store[key].decode() for key in store if is_zarr_key(key)} store[metadata_key] = json.dumps(out).encode() - return ConsolidatedMetadataStore(store, metadata_key) + return ConsolidatedMetadataStore(store, metadata_key=metadata_key) + + +def open_consolidated(store, metadata_key='.zmetadata', mode='r'): + """TODO doc me""" + + from .storage import ConsolidatedMetadataStore + + # normalize parameters + store = normalize_store_arg(store) + if mode not in 'ra': + raise ValueError("invalid mode, expected either 'r' or 'a'; found {!r}" + .format(mode)) + + # setup metadata store + meta_store = ConsolidatedMetadataStore(store, metadata_key=metadata_key) + + # pass through + return open(store=meta_store, chunk_store=store, mode=mode) diff --git a/zarr/storage.py b/zarr/storage.py index 91ebd8e382..06d2232d9f 100644 --- a/zarr/storage.py +++ b/zarr/storage.py @@ -1930,30 +1930,25 @@ def __init__(self, store, metadata_key='.zmetadata'): self.meta_store = {k: v.encode() for k, v in metadata.items()} def __getitem__(self, key): - """Try local dict before falling back to real storage""" - try: - return self.meta_store[key] - except KeyError: - return self.store[key] + return self.meta_store[key] + + def __contains__(self, item): + return item in self.meta_store def
__iter__(self): - """Only list local keys - data must be got via getitem""" return iter(self.meta_store) def __len__(self): - """Only len of local keys""" return len(self.meta_store) def __delitem__(self, key): - """Data can be deleted from storage""" - if key not in self.meta_store: - del self.store[key] - else: - raise NotImplementedError + raise PermissionError def __setitem__(self, key, value): - """Data can be written to storage""" - if key not in self.meta_store: - self.store[key] = value - else: - raise NotImplementedError + raise PermissionError + + def getsize(self, path): + return getsize(self.meta_store, path) + + def listdir(self, path): + return listdir(self.meta_store, path) diff --git a/zarr/tests/test_convenience.py b/zarr/tests/test_convenience.py index 7eb9626405..4eebc97aea 100644 --- a/zarr/tests/test_convenience.py +++ b/zarr/tests/test_convenience.py @@ -12,7 +12,8 @@ import pytest -from zarr.convenience import open, save, save_group, load, copy_store, copy, consolidate_metadata +from zarr.convenience import (open, save, save_group, load, copy_store, copy, + consolidate_metadata, open_consolidated) from zarr.storage import atexit_rmtree, DictStore from zarr.core import Array from zarr.hierarchy import Group, group @@ -93,6 +94,8 @@ def test_lazy_loader(): def test_consolidate_metadata(): from zarr.storage import ConsolidatedMetadataStore + + # setup initial data store = DictStore() z = group(store) z.create_group('g1') @@ -104,6 +107,8 @@ def test_consolidate_metadata(): arr.attrs['data'] = 1 arr[:] = 1.0 assert 16 == arr.nchunks_initialized + + # perform consolidation out = consolidate_metadata(store) assert isinstance(out, ConsolidatedMetadataStore) assert '.zmetadata' in store @@ -114,23 +119,37 @@ def test_consolidate_metadata(): 'g2/arr/.zarray', 'g2/arr/.zattrs']: del store[key] - cstore = ConsolidatedMetadataStore(store) - z2 = open(cstore) + + # open consolidated + z2 = open_consolidated(store, mode='a') assert ['g1', 'g2'] == list(z2) assert 'world' == z2.g2.attrs['hello'] assert 1 == z2.g2.arr.attrs['data'] assert (z2.g2.arr[:] == 1.0).all() assert 16 == z2.g2.arr.nchunks assert 16 == z2.g2.arr.nchunks_initialized - assert list(out) == list(cstore) # tests del/write on the store - with pytest.raises(NotImplementedError): - del cstore['.zgroup'] - with pytest.raises(NotImplementedError): - cstore['.zgroup'] = None - del cstore['g2/arr/0.0'] - assert (z2.g2.arr[:] == 0).all() + with pytest.raises(PermissionError): + del out['.zgroup'] + with pytest.raises(PermissionError): + out['.zgroup'] = None + + # test new metadata are not writeable + with pytest.raises(PermissionError): + z2.create_group('g3') + with pytest.raises(PermissionError): + z2.create_dataset('spam', shape=42, chunks=7, dtype='i4') + with pytest.raises(PermissionError): + del z2['g2'] + + # test consolidated metadata are not writeable + with pytest.raises(PermissionError): + z2.g2.attrs['hello'] = 'universe' + with pytest.raises(PermissionError): + z2.g2.arr.attrs['foo'] = 'bar' + + # test the data are writeable z2.g2.arr[:] = 2 assert (z2.g2.arr[:] == 2).all() From a14b045237f046a5f55d09645edf96dc81dad2bd Mon Sep 17 00:00:00 2001 From: Alistair Miles Date: Fri, 19 Oct 2018 00:34:04 +0100 Subject: [PATCH 23/35] tweaks to consolidated behaviour --- zarr/__init__.py | 3 ++- zarr/convenience.py | 6 +++--- zarr/creation.py | 10 +++++----- zarr/hierarchy.py | 4 ++-- zarr/storage.py | 4 ++-- 5 files changed, 14 insertions(+), 13 deletions(-) diff --git a/zarr/__init__.py b/zarr/__init__.py index 
56d060fdac..cf34d3d427 100644 --- a/zarr/__init__.py +++ b/zarr/__init__.py @@ -12,6 +12,7 @@ from zarr.sync import ThreadSynchronizer, ProcessSynchronizer from zarr.codecs import * from zarr.convenience import (open, save, save_array, save_group, load, copy_store, - copy, copy_all, tree) + copy, copy_all, tree, consolidate_metadata, + open_consolidated) from zarr.errors import CopyError, MetadataError, PermissionError from zarr.version import version as __version__ diff --git a/zarr/convenience.py b/zarr/convenience.py index 9bc8be4438..7bc66f98d8 100644 --- a/zarr/convenience.py +++ b/zarr/convenience.py @@ -18,12 +18,12 @@ # noinspection PyShadowingBuiltins -def open(store, mode='a', **kwargs): +def open(store=None, mode='a', **kwargs): """Convenience function to open a group or array using file-mode-like semantics. Parameters ---------- - store : MutableMapping or string + store : MutableMapping or string, optional Store or path to directory in file system or name of zip file. mode : {'r', 'r+', 'a', 'w', 'w-'}, optional Persistence mode: 'r' means read only (must exist); 'r+' means @@ -1112,7 +1112,7 @@ def is_zarr_key(key): return ConsolidatedMetadataStore(store, metadata_key=metadata_key) -def open_consolidated(store, metadata_key='.zmetadata', mode='r'): +def open_consolidated(store, metadata_key='.zmetadata', mode='a'): """TODO doc me""" from .storage import ConsolidatedMetadataStore diff --git a/zarr/creation.py b/zarr/creation.py index 35cb0cf8c0..0184a4a5da 100644 --- a/zarr/creation.py +++ b/zarr/creation.py @@ -346,15 +346,15 @@ def array(data, **kwargs): return z -def open_array(store, mode='a', shape=None, chunks=True, dtype=None, compressor='default', - fill_value=0, order='C', synchronizer=None, filters=None, - cache_metadata=True, cache_attrs=True, path=None, object_codec=None, - chunk_store=None, **kwargs): +def open_array(store=None, mode='a', shape=None, chunks=True, dtype=None, + compressor='default', fill_value=0, order='C', synchronizer=None, + filters=None, cache_metadata=True, cache_attrs=True, path=None, + object_codec=None, chunk_store=None, **kwargs): """Open an array using file-mode-like semantics. Parameters ---------- - store : MutableMapping or string + store : MutableMapping or string, optional Store or path to directory in file system or name of zip file. mode : {'r', 'r+', 'a', 'w', 'w-'}, optional Persistence mode: 'r' means read only (must exist); 'r+' means diff --git a/zarr/hierarchy.py b/zarr/hierarchy.py index b7359dafa7..17821130eb 100644 --- a/zarr/hierarchy.py +++ b/zarr/hierarchy.py @@ -1058,13 +1058,13 @@ def group(store=None, overwrite=False, chunk_store=None, cache_attrs=cache_attrs, synchronizer=synchronizer, path=path) -def open_group(store, mode='a', cache_attrs=True, synchronizer=None, path=None, +def open_group(store=None, mode='a', cache_attrs=True, synchronizer=None, path=None, chunk_store=None): """Open a group using file-mode-like semantics. Parameters ---------- - store : MutableMapping or string + store : MutableMapping or string, optional Store or path to directory in file system or name of zip file. 
mode : {'r', 'r+', 'a', 'w', 'w-'}, optional Persistence mode: 'r' means read only (must exist); 'r+' means diff --git a/zarr/storage.py b/zarr/storage.py index 06d2232d9f..a86c7dfc05 100644 --- a/zarr/storage.py +++ b/zarr/storage.py @@ -1942,10 +1942,10 @@ def __len__(self): return len(self.meta_store) def __delitem__(self, key): - raise PermissionError + err_read_only() def __setitem__(self, key, value): - raise PermissionError + err_read_only() def getsize(self, path): return getsize(self.meta_store, path) From 0cbda1538bf3f8fc50a174f46e55a1ffb99aa4d7 Mon Sep 17 00:00:00 2001 From: Alistair Miles Date: Fri, 19 Oct 2018 00:54:46 +0100 Subject: [PATCH 24/35] py2 fix --- zarr/tests/test_convenience.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/zarr/tests/test_convenience.py b/zarr/tests/test_convenience.py index 4eebc97aea..b7da890522 100644 --- a/zarr/tests/test_convenience.py +++ b/zarr/tests/test_convenience.py @@ -17,7 +17,7 @@ from zarr.storage import atexit_rmtree, DictStore from zarr.core import Array from zarr.hierarchy import Group, group -from zarr.errors import CopyError +from zarr.errors import CopyError, PermissionError def test_open_array(): From b4b60aa591f8ce9205e76a9382c5e94aaa16a363 Mon Sep 17 00:00:00 2001 From: Martin Durant Date: Tue, 23 Oct 2018 10:03:04 -0400 Subject: [PATCH 25/35] Update docstrings --- zarr/convenience.py | 41 +++++++++++++++++++++++++++------- zarr/tests/test_convenience.py | 7 +++--- 2 files changed, 37 insertions(+), 11 deletions(-) diff --git a/zarr/convenience.py b/zarr/convenience.py index 7bc66f98d8..d45dadc715 100644 --- a/zarr/convenience.py +++ b/zarr/convenience.py @@ -1080,11 +1080,13 @@ def consolidate_metadata(store, metadata_key='.zmetadata'): This produces a single object in the backend store, containing all the metadata read from all the zarr-related keys that can be found. This should be used in conjunction with ``storage.ConsolidatedMetadataStore`` - to reduce the number of operations on the backend store at read time. + to reduce the number of operations on the backend store at read time; + normally, users will call ``open_consolidated()`` to open in optimised, + read-only mode. - Note, however, that if any metadata in the store is changed after this - consolidation, then the metadata read by ``storage.ConsolidatedMetadataStore`` - would be out of sync with reality unless this function is called again. + Note that if the metadata in the store is changed after this + consolidation, then the metadata read by ``open_consolidated()`` + would be incorrect unless this function is called again. Parameters ---------- @@ -1095,11 +1097,10 @@ def consolidate_metadata(store, metadata_key='.zmetadata'): Returns ------- - ConsolidatedMetadataStore instance, based on the same base store.
+ Group instance, opened with the new consolidated metadata """ import json - from .storage import ConsolidatedMetadataStore store = normalize_store_arg(store) @@ -1109,11 +1110,35 @@ def is_zarr_key(key): out = {key: store[key].decode() for key in store if is_zarr_key(key)} store[metadata_key] = json.dumps(out).encode() - return ConsolidatedMetadataStore(store, metadata_key=metadata_key) + return open_consolidated(store, metadata_key=metadata_key) def open_consolidated(store, metadata_key='.zmetadata', mode='a'): - """TODO doc me""" + """Open group using metadata consolidated into a single key + + This is an optimised method for opening a Zarr group, where instead of + traversing the group/array hierarchy by accessing the metadata keys at + each level, a single key contains all of the metadata for everything. + This is particularly useful for remote data sources, where the overhead + of accessing a key is large compared to the time to read data. + + The group accessed must have already had its metadata consolidated into a + single key using the function ``consolidate_metadata()``. + + This optimised method only works in modes which do not change the + metadata, although the data may still be written/updated. + + Parameters + ---------- + store : MutableMapping or string + Store or path to directory in file system or name of zip file. + metadata_key : str + Key to read the consolidated metadata from. The default (.zmetadata) + corresponds to the default used by ``consolidate_metadata()``. + mode : {'r', 'a'}, optional + Persistence mode. Only modes which cannot change the metadata are + allowed. """ from .storage import ConsolidatedMetadataStore diff --git a/zarr/tests/test_convenience.py b/zarr/tests/test_convenience.py index b7da890522..07e4451f45 100644 --- a/zarr/tests/test_convenience.py +++ b/zarr/tests/test_convenience.py @@ -110,7 +110,7 @@ def test_consolidate_metadata(): # perform consolidation out = consolidate_metadata(store) - assert isinstance(out, ConsolidatedMetadataStore) + assert isinstance(out, Group) assert '.zmetadata' in store for key in ['.zgroup', 'g1/.zgroup', 'g2/.zgroup', 'g2/.zattrs', 'g2/arr/.zarray', 'g2/arr/.zattrs']: del store[key] @@ -130,10 +130,11 @@ def test_consolidate_metadata(): assert 16 == z2.g2.arr.nchunks_initialized # tests del/write on the store + cmd = ConsolidatedMetadataStore(store) with pytest.raises(PermissionError): - del out['.zgroup'] + del cmd['.zgroup'] with pytest.raises(PermissionError): - out['.zgroup'] = None + cmd['.zgroup'] = None # test new metadata are not writeable with pytest.raises(PermissionError): From cae30daa6d735e36aecb5da72a9d082b938ab105 Mon Sep 17 00:00:00 2001 From: Alistair Miles Date: Wed, 31 Oct 2018 17:00:07 -0500 Subject: [PATCH 26/35] added api docs; consistify references in docstrings --- docs/api/convenience.rst | 2 ++ docs/api/storage.rst | 2 ++ zarr/convenience.py | 61 +++++++++++++++++++++++++++------------- zarr/storage.py | 19 +++++++++---- 4 files changed, 60 insertions(+), 24 deletions(-) diff --git a/docs/api/convenience.rst b/docs/api/convenience.rst index 51997a4dc2..a70a90ce7c 100644 --- a/docs/api/convenience.rst +++ b/docs/api/convenience.rst @@ -10,3 +10,5 @@ Convenience functions (``zarr.convenience``) .. autofunction:: copy_all .. autofunction:: copy_store .. autofunction:: tree +.. autofunction:: consolidate_metadata +.. autofunction:: open_consolidated diff --git a/docs/api/storage.rst b/docs/api/storage.rst index 2365359fa9..74801d3115 100644 --- a/docs/api/storage.rst +++ b/docs/api/storage.rst @@ -27,6 +27,8 @@ Storage (``zarr.storage``) .. automethod:: invalidate_values .. 
automethod:: invalidate_keys +.. autoclass:: ConsolidatedMetadataStore + .. autofunction:: init_array .. autofunction:: init_group .. autofunction:: contains_array diff --git a/zarr/convenience.py b/zarr/convenience.py index d45dadc715..0f4dfa9094 100644 --- a/zarr/convenience.py +++ b/zarr/convenience.py @@ -31,12 +31,17 @@ def open(store=None, mode='a', **kwargs): exist); 'w' means create (overwrite if exists); 'w-' means create (fail if exists). **kwargs - Additional parameters are passed through to :func:`zarr.open_array` or - :func:`zarr.open_group`. + Additional parameters are passed through to :func:`zarr.creation.open_array` or + :func:`zarr.hierarchy.open_group`. + + Returns + ------- + z : :class:`zarr.core.Array` or :class:`zarr.hierarchy.Group` + Array or group, depending on what exists in the given store. See Also -------- - zarr.open_array, zarr.open_group + zarr.creation.open_array, zarr.hierarchy.open_group Examples -------- @@ -1078,16 +1083,17 @@ def consolidate_metadata(store, metadata_key='.zmetadata'): into a single resource and put it under the given key. This produces a single object in the backend store, containing all the - metadata read from all the zarr-related keys that can be found. This - should be used in conjunction with ``storage.ConsolidatedMetadataStore`` - to reduce the number of operations on the backend store at read time; - normally, users will call ``open_consolidated()`` to open in optimised, - read-only mode. + metadata read from all the zarr-related keys that can be found. After + metadata have been consolidated, use :func:`open_consolidated` to open + the root group in optimised, read-only mode, using the consolidated + metadata to reduce the number of read operations on the backend store. Note that if the metadata in the store is changed after this - consolidation, then the metadata read by ``open_consolidated()`` + consolidation, then the metadata read by :func:`open_consolidated` would be incorrect unless this function is called again. + .. note:: This is an experimental feature. Parameters ---------- store : MutableMapping or string @@ -1097,7 +1103,12 @@ def consolidate_metadata(store, metadata_key='.zmetadata'): Returns ------- - Group instance, opened with the new consolidated metadata + g : :class:`zarr.hierarchy.Group` + Group instance, opened with the new consolidated metadata. + + See Also + -------- + open_consolidated """ import json @@ -1113,8 +1124,8 @@ def is_zarr_key(key): return open_consolidated(store, metadata_key=metadata_key) -def open_consolidated(store, metadata_key='.zmetadata', mode='a'): - """Open group using metadata consolidated into a single key +def open_consolidated(store, metadata_key='.zmetadata', mode='r+'): - """TODO doc me""" hmm + """Open group using metadata previously consolidated into a single key. This is an optimised method for opening a Zarr group, where instead of traversing the group/array hierarchy by accessing the metadata keys at each level, a single key contains all of the metadata for everything. This is particularly useful for remote data sources, where the overhead of accessing a key is large compared to the time to read data. The group accessed must have already had its metadata consolidated into a - single key using the function ``consolidate_metadata()``. + single key using the function :func:`consolidate_metadata`. This optimised method only works in modes which do not change the metadata, although the data may still be written/updated. Parameters ---------- store : MutableMapping or string Store or path to directory in file system or name of zip file.
metadata_key : str Key to read the consolidated metadata from. The default (.zmetadata) - corresponds to the default used by ``consolidate_metadata()``. - mode : {'r', 'a'}, optional - Persistence mode. Only modes which cannot change the metadata are - allowed. + corresponds to the default used by :func:`consolidate_metadata`. + mode : {'r', 'r+'}, optional + Persistence mode: 'r' means read only (must exist); 'r+' means + read/write (must exist) although only writes to data are allowed, + changes to metadata including creation of new arrays or groups + are not allowed. + + Returns + ------- + g : :class:`zarr.hierarchy.Group` + Group instance, opened with the consolidated metadata. + + See Also + -------- + consolidate_metadata + """ from .storage import ConsolidatedMetadataStore # normalize parameters store = normalize_store_arg(store) - if mode not in 'ra': - raise ValueError("invalid mode, expected either 'r' or 'a'; found {!r}" + if mode not in {'r', 'r+'}: + raise ValueError("invalid mode, expected either 'r' or 'r+'; found {!r}" .format(mode)) # setup metadata store diff --git a/zarr/storage.py b/zarr/storage.py index a86c7dfc05..f79c313cdf 100644 --- a/zarr/storage.py +++ b/zarr/storage.py @@ -1895,12 +1895,13 @@ def __delitem__(self, key): class ConsolidatedMetadataStore(MutableMapping): - """A layer over other storage, with the metadata within a single key. + """A layer over other storage, where the metadata has been consolidated into + a single key. The purpose of this class is to be able to get all of the metadata for a given dataset in a single read operation from the underlying storage. - See ``convenience.consolidate_metadata()`` for how to create this single - metadata key. + See :func:`zarr.convenience.consolidate_metadata` for how to create this + single metadata key. This class loads from the one key, and stores the data in a dict, so that accessing the keys no longer requires operations on the backend store. This class is read-only, and attempts to change the dataset metadata will fail, but changing the data is possible. If the backend storage is changed directly, then the metadata stored here could become obsolete, and - ``consolidate_metadata`` should be called again and the class re-invoked. - The use case is for write once, read many times. + :func:`zarr.convenience.consolidate_metadata` should be called again and the class + re-invoked. The use case is for write once, read many times. + + .. versionadded:: 2.3 + + .. note:: This is an experimental feature. Parameters ---------- store : MutableMapping Containing the zarr array. metadata_key : str The target in the store where all of the metadata are stored. We assume JSON encoding.
+ See Also + -------- + zarr.convenience.consolidate_metadata, zarr.convenience.open_consolidated + """ def __init__(self, store, metadata_key='.zmetadata'): self.store = store From ba99cfaea09d41f714f991fe097eed13d301a3ec Mon Sep 17 00:00:00 2001 From: Alistair Miles Date: Thu, 1 Nov 2018 09:00:40 -0500 Subject: [PATCH 27/35] add tests --- zarr/tests/test_convenience.py | 14 +++++++++++--- zarr/tests/test_creation.py | 9 +++++++++ 2 files changed, 20 insertions(+), 3 deletions(-) diff --git a/zarr/tests/test_convenience.py b/zarr/tests/test_convenience.py index 07e4451f45..91a3418a96 100644 --- a/zarr/tests/test_convenience.py +++ b/zarr/tests/test_convenience.py @@ -14,7 +14,7 @@ from zarr.convenience import (open, save, save_group, load, copy_store, copy, consolidate_metadata, open_consolidated) -from zarr.storage import atexit_rmtree, DictStore +from zarr.storage import atexit_rmtree, DictStore, getsize, ConsolidatedMetadataStore from zarr.core import Array from zarr.hierarchy import Group, group from zarr.errors import CopyError, PermissionError @@ -93,7 +93,6 @@ def test_lazy_loader(): def test_consolidate_metadata(): - from zarr.storage import ConsolidatedMetadataStore # setup initial data store = DictStore() @@ -121,7 +120,7 @@ def test_consolidate_metadata(): del store[key] # open consolidated - z2 = open_consolidated(store, mode='a') + z2 = open_consolidated(store, mode='r+') assert ['g1', 'g2'] == list(z2) assert 'world' == z2.g2.attrs['hello'] assert 1 == z2.g2.arr.attrs['data'] @@ -136,6 +135,9 @@ def test_consolidate_metadata(): with pytest.raises(PermissionError): cmd['.zgroup'] = None + # test getsize on the store + assert getsize(cmd) == getsize(store) + # test new metadata are not writeable with pytest.raises(PermissionError): z2.create_group('g3') @@ -154,6 +156,12 @@ def test_consolidate_metadata(): z2.g2.arr[:] = 2 assert (z2.g2.arr[:] == 2).all() + # test invalid modes + with pytest.raises(ValueError): + open_consolidated(store, mode='a') + with pytest.raises(ValueError): + open_consolidated(store, mode='w') + class TestCopyStore(unittest.TestCase): diff --git a/zarr/tests/test_creation.py b/zarr/tests/test_creation.py index 304714991e..ef2232c234 100644 --- a/zarr/tests/test_creation.py +++ b/zarr/tests/test_creation.py @@ -3,6 +3,7 @@ import tempfile import shutil import atexit +import os.path import numpy as np @@ -240,6 +241,14 @@ def test_open_array(): assert isinstance(z, Array) assert 'foo/bar' == z.path + # with chunk store + meta_store = 'data/meta.zarr' + chunk_store = 'data/chunks.zarr' + z = open_array(store=meta_store, chunk_store=chunk_store, shape=11, mode='w') + z[:] = 42 + assert os.path.abspath(meta_store) == z.store.path + assert os.path.abspath(chunk_store) == z.chunk_store.path + def test_empty_like(): From 6f01dece292e8bb0fd6e0a01b5178dcd1887518d Mon Sep 17 00:00:00 2001 From: Martin Durant Date: Thu, 1 Nov 2018 13:25:27 -0400 Subject: [PATCH 28/35] Add section to tutorial, add to release notes --- docs/release.rst | 9 +++++++++ docs/tutorial.rst | 27 +++++++++++++++++++++++++++ zarr/tests/test_convenience.py | 2 +- 3 files changed, 37 insertions(+), 1 deletion(-) diff --git a/docs/release.rst b/docs/release.rst index 9acab25fde..a2428e990e 100644 --- a/docs/release.rst +++ b/docs/release.rst @@ -6,6 +6,15 @@ Release notes 2.3.0 (Work in Progress) ------------------------ +Enhancements +~~~~~~~~~~~~ + +* Add "consolidated" metadata as an experimental option: use :func:`zarr.consolidate_metadata` to copy + all metadata from the various keys within 
a data-set under a single key, and + :func:`zarr.open_consolidated` to use this single key. This can greatly cut down the + number of calls to the storage backend, and so remove a lot of of over head for + remote data. :issue:`268`. + Maintenance ~~~~~~~~~~~ diff --git a/docs/tutorial.rst b/docs/tutorial.rst index 5c090669ce..7a0d32ea43 100644 --- a/docs/tutorial.rst +++ b/docs/tutorial.rst @@ -804,6 +804,33 @@ interface to the storage. .. _tutorial_copy: +Consolidating metadata +~~~~~~~~~~~~~~~~~~~~~~ + +(This is an experimental feature.) + +Since there is a significant overhead for every connection to s3, the pattern described in +the previous section may incur significant latency while scanning the metadata of the data-set +hierarchy, even though each individual file is small. For cases such as these, once the file +is static and can be regarded as read-only, at least for the metadata/structure of the +data-set, the many metadata files can be consolidated into a single one. +Doing this can greatly increase the speed of reading the data-set hierarchy:: + + >>> zarr.consolidate_metadata(store) + +Creates a special key with a copy of all of the metadata from the many files. +Later:: + + >>> root = zarr.open_consolidated(store) + +Uses this special key to read all of the metadata in a single call to the backend storage. + +Note that, the data-set could still be opened in the normal way and altered, causing the +consolidated metadata to become out of sync with the real state of the data-set. In this +case, :func:`zarr.consolidate_metadata` would need to be called again. The data-set +returned by :func:`zarr.open_consolidated` is read-only for the metadata, but the data +values can still be updated. + Copying/migrating data ---------------------- diff --git a/zarr/tests/test_convenience.py b/zarr/tests/test_convenience.py index 07e4451f45..c6439c7c8e 100644 --- a/zarr/tests/test_convenience.py +++ b/zarr/tests/test_convenience.py @@ -121,7 +121,7 @@ def test_consolidate_metadata(): del store[key] # open consolidated - z2 = open_consolidated(store, mode='a') + z2 = open_consolidated(store) assert ['g1', 'g2'] == list(z2) assert 'world' == z2.g2.attrs['hello'] assert 1 == z2.g2.arr.attrs['data'] From f5130ac890dc6f0430f3d77da95de3add6a5f48e Mon Sep 17 00:00:00 2001 From: Alistair Miles Date: Thu, 1 Nov 2018 12:46:44 -0500 Subject: [PATCH 29/35] fix getsize test --- zarr/tests/test_convenience.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/zarr/tests/test_convenience.py b/zarr/tests/test_convenience.py index 91a3418a96..0166df6320 100644 --- a/zarr/tests/test_convenience.py +++ b/zarr/tests/test_convenience.py @@ -136,7 +136,7 @@ def test_consolidate_metadata(): cmd['.zgroup'] = None # test getsize on the store - assert getsize(cmd) == getsize(store) + assert isinstance(getsize(cmd), int) # test new metadata are not writeable with pytest.raises(PermissionError): From 3d3cb2f73c2992faeb6eaaed939ec280d0c68a6a Mon Sep 17 00:00:00 2001 From: Alistair Miles Date: Thu, 1 Nov 2018 17:11:48 -0400 Subject: [PATCH 30/35] add setuptools-scm to dev env so can go fully offline --- requirements_dev.txt | 1 + 1 file changed, 1 insertion(+) diff --git a/requirements_dev.txt b/requirements_dev.txt index d495e04bfd..671fc789f6 100644 --- a/requirements_dev.txt +++ b/requirements_dev.txt @@ -46,6 +46,7 @@ python-dateutil==2.7.3 readme-renderer==22.0 requests==2.19.1 requests-toolbelt==0.8.0 +setuptools-scm=3.1.0 s3fs==0.1.6 s3transfer==0.1.13 scandir==1.9.0 From 
8acf83a4b9c70915589df3fa5724209264d81911 Mon Sep 17 00:00:00 2001 From: Alistair Miles Date: Thu, 1 Nov 2018 17:15:43 -0400 Subject: [PATCH 31/35] fix requirements --- requirements_dev.txt | 2 +- zarr/tests/test_convenience.py | 3 ++- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/requirements_dev.txt b/requirements_dev.txt index 671fc789f6..23de426def 100644 --- a/requirements_dev.txt +++ b/requirements_dev.txt @@ -46,7 +46,7 @@ python-dateutil==2.7.3 readme-renderer==22.0 requests==2.19.1 requests-toolbelt==0.8.0 -setuptools-scm=3.1.0 +setuptools-scm==3.1.0 s3fs==0.1.6 s3transfer==0.1.13 scandir==1.9.0 diff --git a/zarr/tests/test_convenience.py b/zarr/tests/test_convenience.py index 773a135411..12bfab4a5a 100644 --- a/zarr/tests/test_convenience.py +++ b/zarr/tests/test_convenience.py @@ -4,6 +4,7 @@ import atexit import os import unittest +from numbers import Integral import numpy as np @@ -136,7 +137,7 @@ def test_consolidate_metadata(): cmd['.zgroup'] = None # test getsize on the store - assert isinstance(getsize(cmd), int) + assert isinstance(getsize(cmd), Integral) # test new metadata are not writeable with pytest.raises(PermissionError): From 2f8953543636f559750de0c643c221298495409a Mon Sep 17 00:00:00 2001 From: Alistair Miles Date: Thu, 1 Nov 2018 17:34:25 -0400 Subject: [PATCH 32/35] skip consolidate doctests; minor edits --- docs/tutorial.rst | 59 +++++++++++++++++++++++++++++------------------ 1 file changed, 36 insertions(+), 23 deletions(-) diff --git a/docs/tutorial.rst b/docs/tutorial.rst index 7a0d32ea43..606b5acef5 100644 --- a/docs/tutorial.rst +++ b/docs/tutorial.rst @@ -778,9 +778,11 @@ chunk size, which will reduce the number of chunks and thus reduce the number of round-trips required to retrieve data for an array (and thus reduce the impact of network latency). Another option is to try to increase the compression ratio by changing compression options or trying a different compressor (which will reduce the impact of -limited network bandwidth). As of version 2.2, Zarr also provides the -:class:`zarr.storage.LRUStoreCache` which can be used to implement a local in-memory cache -layer over a remote store. E.g.:: +limited network bandwidth). + +As of version 2.2, Zarr also provides the :class:`zarr.storage.LRUStoreCache` +which can be used to implement a local in-memory cache layer over a remote +store. E.g.:: >>> s3 = s3fs.S3FileSystem(anon=True, client_kwargs=dict(region_name='eu-west-2')) >>> store = s3fs.S3Map(root='zarr-demo/store', s3=s3, check=False) @@ -797,10 +799,10 @@ layer over a remote store. E.g.:: b'Hello from the cloud!' 0.0009490990014455747 -If you are still experiencing poor performance with distributed/cloud storage, please -raise an issue on the GitHub issue tracker with any profiling data you can provide, as -there may be opportunities to optimise further either within Zarr or within the mapping -interface to the storage. +If you are still experiencing poor performance with distributed/cloud storage, +please raise an issue on the GitHub issue tracker with any profiling data you +can provide, as there may be opportunities to optimise further either within +Zarr or within the mapping interface to the storage. .. _tutorial_copy: @@ -809,27 +811,38 @@ Consolidating metadata (This is an experimental feature.) 
-Since there is a significant overhead for every connection to s3, the pattern described in -the previous section may incur significant latency while scanning the metadata of the data-set -hierarchy, even though each individual file is small. For cases such as these, once the file -is static and can be regarded as read-only, at least for the metadata/structure of the -data-set, the many metadata files can be consolidated into a single one. -Doing this can greatly increase the speed of reading the data-set hierarchy:: +Since there is a significant overhead for every connection to a cloud object +store such as S3, the pattern described in the previous section may incur +significant latency while scanning the metadata of the dataset hierarchy, even +though each individual metadata object is small. For cases such as these, once +the data are static and can be regarded as read-only, at least for the +metadata/structure of the dataset hierarchy, the many metadata objects can be +consolidated into a single one via +:func:`zarr.convenience.consolidate_metadata`. Doing this can greatly increase +the speed of reading the dataset metadata, e.g.:: + + >>> zarr.consolidate_metadata(store) # doctest: +SKIP + +This creates a special key with a copy of all of the metadata from all of the +metadata objects in the store. - >>> zarr.consolidate_metadata(store) +Later, to open a Zarr store with consolidated metadata, use +:func:`zarr.convenience.open_consolidated`, e.g.:: -Creates a special key with a copy of all of the metadata from the many files. -Later:: + >>> root = zarr.open_consolidated(store) # doctest: +SKIP - >>> root = zarr.open_consolidated(store) +This uses the special key to read all of the metadata in a single call to the +backend storage. -Uses this special key to read all of the metadata in a single call to the backend storage. +Note that the hierarchy could still be opened in the normal way and altered, +causing the consolidated metadata to become out of sync with the real state of +the dataset hierarchy. In this case, +:func:`zarr.convenience.consolidate_metadata` would need to be called again. -Note that, the data-set could still be opened in the normal way and altered, causing the -consolidated metadata to become out of sync with the real state of the data-set. In this -case, :func:`zarr.consolidate_metadata` would need to be called again. The data-set -returned by :func:`zarr.open_consolidated` is read-only for the metadata, but the data -values can still be updated. +To protect against consolidated metadata accidentally getting out of sync, the +root group returned by :func:`zarr.convenience.open_consolidated` is read-only +for the metadata, meaning that no new groups or arrays can be created, and +arrays cannot be resized. However, data values within arrays can still be updated.
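+
+The expected pattern of use, sketched below with an in-memory store purely for
+illustration (any mutable mapping behaves the same way), is to write data once,
+consolidate once, then read many times; data reads and writes still go to the
+underlying store::
+
+    >>> from zarr.storage import DictStore  # doctest: +SKIP
+    >>> store = DictStore()  # doctest: +SKIP
+    >>> root = zarr.group(store)  # doctest: +SKIP
+    >>> arr = root.create_dataset('foo', shape=(100,), chunks=(10,), dtype='i4')  # doctest: +SKIP
+    >>> arr[:] = 1  # doctest: +SKIP
+    >>> zarr.consolidate_metadata(store)  # doctest: +SKIP
+    >>> root = zarr.open_consolidated(store)  # doctest: +SKIP
+    >>> int(root['foo'][0])  # doctest: +SKIP
+    1
+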
Copying/migrating data ---------------------- From c8ed0f60838cbd79ec684ba5be44633829543299 Mon Sep 17 00:00:00 2001 From: Alistair Miles Date: Thu, 1 Nov 2018 18:16:02 -0400 Subject: [PATCH 33/35] fix refs [ci skip] --- docs/release.rst | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/docs/release.rst b/docs/release.rst index 3421057fe4..96ac7c8f2f 100644 --- a/docs/release.rst +++ b/docs/release.rst @@ -9,11 +9,12 @@ Release notes Enhancements ~~~~~~~~~~~~ -* Add "consolidated" metadata as an experimental option: use :func:`zarr.consolidate_metadata` to copy - all metadata from the various keys within a data-set under a single key, and - :func:`zarr.open_consolidated` to use this single key. This can greatly cut down the - number of calls to the storage backend, and so remove a lot of of over head for - remote data. By :user:`Martin Durant `, :issue:`268`. +* Add "consolidated" metadata as an experimental feature: use + :func:`zarr.convenience.consolidate_metadata` to copy all metadata from the various + metadata keys within a dataset hierarchy under a single key, and + :func:`zarr.convenience.open_consolidated` to use this single key. This can greatly + cut down the number of calls to the storage backend, and so remove a lot of overhead + for reading remote data. By :user:`Martin Durant `, :issue:`268`. * Support has been added for structured arrays with sub-array shape and/or nested fields. By :user:`Tarik Onalan `, :issue:`111`, :issue:`296`. From 9c0c621194d4a061e609a4ed5b48f64b7934586e Mon Sep 17 00:00:00 2001 From: Alistair Miles Date: Sat, 3 Nov 2018 17:09:21 -0400 Subject: [PATCH 34/35] make consolidated metadata human-readable --- zarr/attrs.py | 4 ++-- zarr/compat.py | 4 ++++ zarr/convenience.py | 12 ++++++++++-- zarr/core.py | 3 +++ zarr/meta.py | 43 +++++++++++++++++++++++++++++++------------ zarr/storage.py | 16 +++++++++++++--- 6 files changed, 63 insertions(+), 19 deletions(-) diff --git a/zarr/attrs.py b/zarr/attrs.py index 6d74d6479a..21cb77bc10 100644 --- a/zarr/attrs.py +++ b/zarr/attrs.py @@ -4,8 +4,8 @@ from collections import MutableMapping -from zarr.compat import text_type from zarr.errors import PermissionError +from zarr.meta import parse_metadata class Attributes(MutableMapping): @@ -43,7 +43,7 @@ def _get_nosync(self): except KeyError: d = dict() else: - d = json.loads(text_type(data, 'ascii')) + d = parse_metadata(data) return d def asdict(self): diff --git a/zarr/compat.py b/zarr/compat.py index 9be3384123..117a8edf59 100644 --- a/zarr/compat.py +++ b/zarr/compat.py @@ -19,6 +19,8 @@ class PermissionError(Exception): def OrderedDict_move_to_end(od, key): od[key] = od.pop(key) + from collections import Mapping + else: # pragma: py2 no cover @@ -29,3 +31,5 @@ def OrderedDict_move_to_end(od, key): def OrderedDict_move_to_end(od, key): od.move_to_end(key) + + from collections.abc import Mapping diff --git a/zarr/convenience.py b/zarr/convenience.py index 0f4dfa9094..27b0655baa 100644 --- a/zarr/convenience.py +++ b/zarr/convenience.py @@ -15,6 +15,7 @@ from zarr.errors import err_path_not_found, CopyError from zarr.util import normalize_storage_path, TreeViewer, buffer_size from zarr.compat import PY2, text_type +from zarr.meta import ensure_str, json_dumps # noinspection PyShadowingBuiltins @@ -1119,8 +1120,15 @@ def is_zarr_key(key): return (key.endswith('.zarray') or key.endswith('.zgroup') or key.endswith('.zattrs')) - out = {key: store[key].decode() for key in store if is_zarr_key(key)} - store[metadata_key] = 
json.dumps(out).encode() +# out = {key: store[key].decode() for key in store if is_zarr_key(key)} + out = { + 'zarr_consolidated_format': 1, + 'metadata': { + key: json.loads(ensure_str(store[key])) + for key in store if is_zarr_key(key) + } + } + store[metadata_key] = json_dumps(out).encode() return open_consolidated(store, metadata_key=metadata_key) diff --git a/zarr/core.py b/zarr/core.py index 00ad269557..b4da45cd99 100644 --- a/zarr/core.py +++ b/zarr/core.py @@ -165,6 +165,9 @@ def _load_metadata_nosync(self): if config is None: self._compressor = None else: + # temporary workaround for + # https://github.com/zarr-developers/numcodecs/issues/78 + config = dict(config) self._compressor = get_codec(config) # setup filters diff --git a/zarr/meta.py b/zarr/meta.py index 291e5c6643..bef53c2917 100644 --- a/zarr/meta.py +++ b/zarr/meta.py @@ -7,14 +7,14 @@ import numpy as np -from zarr.compat import PY2, binary_type +from zarr.compat import PY2, binary_type, Mapping from zarr.errors import MetadataError ZARR_FORMAT = 2 -def _ensure_str(s): +def ensure_str(s): if PY2: # pragma: py3 no cover # noinspection PyUnresolvedReferences if isinstance(s, buffer): # noqa @@ -27,12 +27,32 @@ def _ensure_str(s): return s +def json_dumps(o): + """Write JSON in a consistent, human-readable way.""" + return json.dumps(o, indent=4, sort_keys=True, ensure_ascii=True, + separators=(',', ': ')) + + +def parse_metadata(s): + if isinstance(s, Mapping): + # assume metadata has already been parsed into a mapping object + meta = s + else: + # assume metadata needs to be parsed as JSON + s = ensure_str(s) + meta = json.loads(s) + return meta + + def decode_array_metadata(s): - s = _ensure_str(s) - meta = json.loads(s) + meta = parse_metadata(s) + + # check metadata format zarr_format = meta.get('zarr_format', None) if zarr_format != ZARR_FORMAT: raise MetadataError('unsupported zarr format: %s' % zarr_format) + + # extract array metadata fields try: dtype = decode_dtype(meta['dtype']) fill_value = decode_fill_value(meta['fill_value'], dtype) @@ -67,8 +87,7 @@ def encode_array_metadata(meta): order=meta['order'], filters=meta['filters'], ) - s = json.dumps(meta, indent=4, sort_keys=True, ensure_ascii=True, - separators=(',', ': ')) + s = json_dumps(meta) b = s.encode('ascii') return b @@ -98,14 +117,14 @@ def decode_dtype(d): def decode_group_metadata(s): - s = _ensure_str(s) - meta = json.loads(s) + meta = parse_metadata(s) + + # check metadata format version zarr_format = meta.get('zarr_format', None) if zarr_format != ZARR_FORMAT: raise MetadataError('unsupported zarr format: %s' % zarr_format) - meta = dict( - zarr_format=ZARR_FORMAT, - ) + + meta = dict(zarr_format=zarr_format) return meta @@ -115,7 +134,7 @@ def encode_group_metadata(meta=None): meta = dict( zarr_format=ZARR_FORMAT, ) - s = json.dumps(meta, indent=4, sort_keys=True, ensure_ascii=True) + s = json_dumps(meta) b = s.encode('ascii') return b diff --git a/zarr/storage.py b/zarr/storage.py index 5c8e1f611c..6720b42d12 100644 --- a/zarr/storage.py +++ b/zarr/storage.py @@ -41,7 +41,7 @@ from zarr.compat import PY2, binary_type, OrderedDict_move_to_end from numcodecs.registry import codec_registry from zarr.errors import (err_contains_group, err_contains_array, err_bad_compressor, - err_fspath_exists_notdir, err_read_only) + err_fspath_exists_notdir, err_read_only, MetadataError) array_meta_key = '.zarray' @@ -1932,12 +1932,22 @@ class ConsolidatedMetadataStore(MutableMapping): """ def __init__(self, store, metadata_key='.zmetadata'): self.store = 
store + + # retrieve consolidated metadata if sys.version_info.major == 3 and sys.version_info.minor < 6: d = store[metadata_key].decode() # pragma: no cover else: # pragma: no cover d = store[metadata_key] - metadata = json.loads(d) - self.meta_store = {k: v.encode() for k, v in metadata.items()} + meta = json.loads(d) + + # check format of consolidated metadata + consolidated_format = meta.get('zarr_consolidated_format', None) + if consolidated_format != 1: + raise MetadataError('unsupported zarr consolidated metadata format: %s' % + consolidated_format) + + # decode metadata + self.meta_store = meta['metadata'] def __getitem__(self, key): return self.meta_store[key] From ccef26c3e86dfc45e310c5df15197e9f02a92819 Mon Sep 17 00:00:00 2001 From: Alistair Miles Date: Tue, 6 Nov 2018 10:10:51 -0500 Subject: [PATCH 35/35] comments [ci skip] --- zarr/convenience.py | 1 - zarr/meta.py | 8 ++++++++ 2 files changed, 8 insertions(+), 1 deletion(-) diff --git a/zarr/convenience.py b/zarr/convenience.py index 27b0655baa..1bb99c92e4 100644 --- a/zarr/convenience.py +++ b/zarr/convenience.py @@ -1120,7 +1120,6 @@ def is_zarr_key(key): return (key.endswith('.zarray') or key.endswith('.zgroup') or key.endswith('.zattrs')) -# out = {key: store[key].decode() for key in store if is_zarr_key(key)} out = { 'zarr_consolidated_format': 1, 'metadata': { diff --git a/zarr/meta.py b/zarr/meta.py index bef53c2917..9ce580eff2 100644 --- a/zarr/meta.py +++ b/zarr/meta.py @@ -34,13 +34,21 @@ def json_dumps(o): def parse_metadata(s): + + # Here we allow that a store may return an already-parsed metadata object, + # or a string of JSON that we will parse here. We allow for an already-parsed + # object to accommodate a consolidated metadata store, where all the metadata for + # all groups and arrays will already have been parsed from JSON. + if isinstance(s, Mapping): # assume metadata has already been parsed into a mapping object meta = s + else: # assume metadata needs to be parsed as JSON s = ensure_str(s) meta = json.loads(s) + return meta
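
For illustration of the format established by PATCH 34: the consolidated key is plain, human-readable JSON wrapping the already-parsed metadata objects, so it can be inspected directly. A minimal sketch using only the API introduced in this series (the store choice, names and array parameters here are made up for the example):

    import json

    import zarr
    from zarr.storage import DictStore

    # build a tiny hierarchy and consolidate its metadata
    store = DictStore()
    root = zarr.group(store)
    root.create_dataset('arr', shape=(20, 20), chunks=(5, 5), dtype='f8')
    zarr.consolidate_metadata(store)

    # '.zmetadata' holds a versioned envelope; the 'metadata' mapping contains
    # the parsed content of each consolidated key, e.g. '.zgroup', 'arr/.zarray'
    meta = json.loads(store['.zmetadata'].decode())
    assert meta['zarr_consolidated_format'] == 1
    print(sorted(meta['metadata']))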