From e00bef57682a0bcb8a9ffee012dec09fb5d9e391 Mon Sep 17 00:00:00 2001 From: Wolf Vollprecht Date: Mon, 9 Jan 2023 10:18:36 +0100 Subject: [PATCH 1/7] initial cep for repodata state --- cep-repodata-state.md | 79 +++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 79 insertions(+) create mode 100644 cep-repodata-state.md diff --git a/cep-repodata-state.md b/cep-repodata-state.md new file mode 100644 index 00000000..3f7ad2a2 --- /dev/null +++ b/cep-repodata-state.md @@ -0,0 +1,79 @@ + + + + + + + + +
Title .state.json files for repodata metadata
Status Draft
Author(s) Wolf Vollprecht <wolf@prefix.dev>
Created Jan 09, 2023
Updated Jan 09, 2023
Discussion https://conda.slack.com/archives/C017F7C0VM3/p1672669131100819
Implementation https://github.com/mamba-org/mamba/pull/2113
+ +## Abstract + +Changing how conda and mamba store metadata about repodata.json downloads. + +### Motivation + +When conda currently downloads `repodata.json` files from the internet, it stores metadata "inside" the file by adding some JSON keys: + +- `_url`: The URL that was requested +- `_etag`: ETag returned from server +- `_mod`: Last-Modified header from server +- `_cache_control`: Cache-Control header from server + +These are stored as three string values. + +This is not an ideal approach as it modifies the `repodata.json` file and corrupts e.g. the hash of the file. Also, the repodata files have gotten increasingly large, and parsing these state values can require parsing a large `json` file. + +Therefore we propose to store the metadata in a secondary file called `.state.json` file next to the repodata. + +Another motivating factor is that for the `jlap` proposal we need to (repeatedly) compute the hash value of the `repodata.json` file -- that only gives correct results straight away when the repodata is stored externally. + +Both mamba and conda currently use the same cache folder. If both don't implement the same storage strategy but continue to share the same repodata cache, it would lead to frequent cache busting. 
+ +### Specification + +```json +{ + // we ensure that state.json and .json files are in sync by storing the file + // last modified time in the state file, as well as the file size + "file_mtime": { + // seconds and nanoseconds counted from UNIX timestamp (1970-01-01) + "seconds": "", + "nanoseconds": "" + }, + "file_size": "", // file size in bytes + + // The header values as before + "url": "", + "etag: "", + "mod": "", + "cache_control": "", + + // these are alternative encodings of the repodata.json that + // can be used for faster downloading + // both `has_zst` and `has_jlap` keys are optional but should be kept + // even if the other data times out or `file_mtime` does not match + "has_zst": { + // UTC RFC3999 timestamp of when we last checked wether the file is available or not + // in this case the `repodata.json.zst` file + // Note: same format as conda TUF spec + "last_checked": "2023-01-08T11:45:44Z", + // false = unavailable, true = available + "value": false + }, + "has_jlap": { + // same format as `has_zst` + } +} +``` + +If the `state.json` file_mtime or file_size does not match the `.json` file actual `mtime`, the header values are discarded. However, the `has_zst` or `has_jlap` values are kept as they are independent from the repodata validity on disk. + +### Backward compatibility + +Older clients that try to reuse the existing cache will not be able to make use of the cached repodata as they do not know about the state (since it's not written to the same location). That means they will redownload the repodata. + +## Copyright + +All CEPs are explicitly [CC0 1.0 Universal](https://creativecommons.org/publicdomain/zero/1.0/). 
\ No newline at end of file From 33180fc69f855eb5188fa2c64f8160475da20a82 Mon Sep 17 00:00:00 2001 From: Wolf Vollprecht Date: Mon, 9 Jan 2023 10:23:12 +0100 Subject: [PATCH 2/7] fix syntax highlighting --- cep-repodata-state.md | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/cep-repodata-state.md b/cep-repodata-state.md index 3f7ad2a2..3db33a02 100644 --- a/cep-repodata-state.md +++ b/cep-repodata-state.md @@ -33,22 +33,22 @@ Both mamba and conda currently use the same cache folder. If both don't implemen ### Specification -```json +```json5 { // we ensure that state.json and .json files are in sync by storing the file // last modified time in the state file, as well as the file size "file_mtime": { // seconds and nanoseconds counted from UNIX timestamp (1970-01-01) - "seconds": "", - "nanoseconds": "" + "seconds": INTEGER, + "nanoseconds": INTEGER }, - "file_size": "", // file size in bytes + "file_size": INTEGER, // file size in bytes // The header values as before - "url": "", - "etag: "", - "mod": "", - "cache_control": "", + "url": STRING, + "etag": STRING, + "mod": STRING, + "cache_control": STRING, // these are alternative encodings of the repodata.json that // can be used for faster downloading @@ -60,7 +60,7 @@ Both mamba and conda currently use the same cache folder. If both don't implemen // Note: same format as conda TUF spec "last_checked": "2023-01-08T11:45:44Z", // false = unavailable, true = available - "value": false + "value": BOOLEAN }, "has_jlap": { // same format as `has_zst` @@ -76,4 +76,4 @@ Older clients that try to reuse the existing cache will not be able to make use ## Copyright -All CEPs are explicitly [CC0 1.0 Universal](https://creativecommons.org/publicdomain/zero/1.0/). \ No newline at end of file +All CEPs are explicitly [CC0 1.0 Universal](https://creativecommons.org/publicdomain/zero/1.0/). 
From d34ea8de9138616fadff3497119355d1a3612a35 Mon Sep 17 00:00:00 2001 From: Daniel Holth Date: Mon, 9 Jan 2023 14:00:30 -0500 Subject: [PATCH 3/7] proposed edits --- cep-repodata-state.md | 23 ++++++++++++++--------- 1 file changed, 14 insertions(+), 9 deletions(-) diff --git a/cep-repodata-state.md b/cep-repodata-state.md index 3db33a02..48d9b796 100644 --- a/cep-repodata-state.md +++ b/cep-repodata-state.md @@ -21,7 +21,7 @@ When conda currently downloads `repodata.json` files from the internet, it store - `_mod`: Last-Modified header from server - `_cache_control`: Cache-Control header from server -These are stored as three string values. +These are stored as three string values. This is not an ideal approach as it modifies the `repodata.json` file and corrupts e.g. the hash of the file. Also, the repodata files have gotten increasingly large, and parsing these state values can require parsing a large `json` file. @@ -37,12 +37,10 @@ Both mamba and conda currently use the same cache folder. If both don't implemen { // we ensure that state.json and .json files are in sync by storing the file // last modified time in the state file, as well as the file size - "file_mtime": { - // seconds and nanoseconds counted from UNIX timestamp (1970-01-01) - "seconds": INTEGER, - "nanoseconds": INTEGER - }, - "file_size": INTEGER, // file size in bytes + + // seconds and nanoseconds counted from UNIX timestamp (1970-01-01) + "mtime_ns": INTEGER, + "size": INTEGER, // file size in bytes // The header values as before "url": STRING, @@ -50,7 +48,7 @@ Both mamba and conda currently use the same cache folder. 
If both don't implemen "mod": STRING, "cache_control": STRING, - // these are alternative encodings of the repodata.json that + // these are alternative encodings of the repodata.json that // can be used for faster downloading // both `has_zst` and `has_jlap` keys are optional but should be kept // even if the other data times out or `file_mtime` does not match @@ -58,18 +56,25 @@ Both mamba and conda currently use the same cache folder. If both don't implemen // UTC RFC3999 timestamp of when we last checked wether the file is available or not // in this case the `repodata.json.zst` file // Note: same format as conda TUF spec + // Python's time.time_ns() would be convenient? "last_checked": "2023-01-08T11:45:44Z", // false = unavailable, true = available "value": BOOLEAN }, "has_jlap": { // same format as `has_zst` - } + }, + + "jlap": { } // unspecified additional state for jlap when available } ``` If the `state.json` file_mtime or file_size does not match the `.json` file actual `mtime`, the header values are discarded. However, the `has_zst` or `has_jlap` values are kept as they are independent from the repodata validity on disk. +If the client is tracking `repodata.json.zst` or `repodata.jlap` instead of +`(current_)?repodata.json`, then `etag`/`mod`/`cache_control` will correspond to +those remote files, instead of `repodata.json`. + ### Backward compatibility Older clients that try to reuse the existing cache will not be able to make use of the cached repodata as they do not know about the state (since it's not written to the same location). That means they will redownload the repodata. 
From 2d32dc82bb93d90a7588be6237595a71a0352923 Mon Sep 17 00:00:00 2001 From: Daniel Holth Date: Fri, 3 Mar 2023 10:54:07 -0500 Subject: [PATCH 4/7] add refresh_ns field --- cep-repodata-state.md | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/cep-repodata-state.md b/cep-repodata-state.md index 48d9b796..d70e3e6b 100644 --- a/cep-repodata-state.md +++ b/cep-repodata-state.md @@ -42,6 +42,12 @@ Both mamba and conda currently use the same cache folder. If both don't implemen "mtime_ns": INTEGER, "size": INTEGER, // file size in bytes + // most recent remote request e.g. "304 Not Modified", instead + // of touching the cached repodata.json file. + // compare with `cache_control: max-age=`. + // nanosecond-resolution UNIX timestamp. + "refresh_ns": INTEGER, + // The header values as before "url": STRING, "etag": STRING, @@ -69,7 +75,9 @@ Both mamba and conda currently use the same cache folder. If both don't implemen } ``` -If the `state.json` file_mtime or file_size does not match the `.json` file actual `mtime`, the header values are discarded. However, the `has_zst` or `has_jlap` values are kept as they are independent from the repodata validity on disk. +If the `state.json` `mtime_ns` or `size` do not match the `.json` file the +header values are discarded. However, the `has_zst` or `has_jlap` values are kept as +they are independent from the repodata validity on disk. 
If the client is tracking `repodata.json.zst` or `repodata.jlap` instead of `(current_)?repodata.json`, then `etag`/`mod`/`cache_control` will correspond to From f3b745ca20ee747689c50633acd5b13cf7d216fa Mon Sep 17 00:00:00 2001 From: Daniel Holth Date: Tue, 21 Mar 2023 14:37:06 -0400 Subject: [PATCH 5/7] rename to info.json; add hash keys, locking --- cep-repodata-state.md | 36 +++++++++++++++++++++++++++++------- 1 file changed, 29 insertions(+), 7 deletions(-) diff --git a/cep-repodata-state.md b/cep-repodata-state.md index d70e3e6b..6a52b621 100644 --- a/cep-repodata-state.md +++ b/cep-repodata-state.md @@ -1,7 +1,8 @@ - + + @@ -25,7 +26,7 @@ These are stored as three string values. This is not an ideal approach as it modifies the `repodata.json` file and corrupts e.g. the hash of the file. Also, the repodata files have gotten increasingly large, and parsing these state values can require parsing a large `json` file. -Therefore we propose to store the metadata in a secondary file called `.state.json` file next to the repodata. +Therefore we propose to store the metadata in a secondary file called `.info.json` file next to the repodata. Another motivating factor is that for the `jlap` proposal we need to (repeatedly) compute the hash value of the `repodata.json` file -- that only gives correct results straight away when the repodata is stored externally. @@ -35,8 +36,8 @@ Both mamba and conda currently use the same cache folder. If both don't implemen ```json5 { - // we ensure that state.json and .json files are in sync by storing the file - // last modified time in the state file, as well as the file size + // we ensure that info.json and .json files are in sync by storing the file + // last modified time in the info file, as well as the file size // seconds and nanoseconds counted from UNIX timestamp (1970-01-01) "mtime_ns": INTEGER, @@ -51,9 +52,15 @@ Both mamba and conda currently use the same cache folder. 
If both don't implemen // The header values as before "url": STRING, "etag": STRING, - "mod": STRING, + "last_modified": STRING, "cache_control": STRING, + // Hash of the cached-on-disk repodata.json. In Python: hashlib.blake2b(digest_size=32) + "blake2_256": STRING, + // Upstream hash represented by the on-disk file. Used for jlap which + // reformats the cached json but knows equivalent remote repodata.json hashes. + "blake2_256_nominal": STRING, + // these are alternative encodings of the repodata.json that // can be used for faster downloading // both `has_zst` and `has_jlap` keys are optional but should be kept @@ -75,14 +82,29 @@ Both mamba and conda currently use the same cache folder. If both don't implemen } ``` -If the `state.json` `mtime_ns` or `size` do not match the `.json` file the +If the `info.json` `mtime_ns` or `size` do not match the `.json` file the header values are discarded. However, the `has_zst` or `has_jlap` values are kept as they are independent from the repodata validity on disk. If the client is tracking `repodata.json.zst` or `repodata.jlap` instead of -`(current_)?repodata.json`, then `etag`/`mod`/`cache_control` will correspond to +`(current_)?repodata.json`, then `etag`/`last_modified`/`cache_control` will correspond to those remote files, instead of `repodata.json`. +## Locking + +To ensure that the `info.json` is consistent with the cached `.json` even if +multiple programs are trying to update the cache at the same time, locking +should be used. The client uses the `info.json` file as a lock file. It holds an +advisory `fcntl()` or Windows record lock on byte 21 of that file while updating +both the `info.json` and `.json` files. It may or may not additionally lock the +`.json` file. If the lock fails, neither file is changed. + +[A Python implementation](https://github.com/conda/conda/blob/main/conda/gateways/repodata/lock.py) + +This minimal scheme only helps to prevent the cache from being corrupted. 
+Additional locking would be needed to make it "advisable" to run multiple +installers in parallel. + ### Backward compatibility Older clients that try to reuse the existing cache will not be able to make use of the cached repodata as they do not know about the state (since it's not written to the same location). That means they will redownload the repodata. From c835c463e6304e20b12d263c42b031f630b8c0f0 Mon Sep 17 00:00:00 2001 From: Daniel Holth Date: Wed, 22 Mar 2023 13:50:13 -0400 Subject: [PATCH 6/7] link to mamba --- cep-repodata-state.md | 1 + 1 file changed, 1 insertion(+) diff --git a/cep-repodata-state.md b/cep-repodata-state.md index 6a52b621..c27d634e 100644 --- a/cep-repodata-state.md +++ b/cep-repodata-state.md @@ -100,6 +100,7 @@ both the `info.json` and `.json` files. It may or may not additionally lock the `.json` file. If the lock fails, neither file is changed. [A Python implementation](https://github.com/conda/conda/blob/main/conda/gateways/repodata/lock.py) +[Mamba's LockFile class](https://github.com/mamba-org/mamba/blob/main/libmamba/include/mamba/core/util.hpp#L167) This minimal scheme only helps to prevent the cache from being corrupted. Additional locking would be needed to make it "advisable" to run multiple From 18288b04c5065511d5d54f720879a7cb163e1157 Mon Sep 17 00:00:00 2001 From: Daniel Holth Date: Fri, 7 Apr 2023 12:01:10 -0400 Subject: [PATCH 7/7] add jlap metadata; leave mod key --- cep-repodata-state.md | 16 ++++++++++++++-- 1 file changed, 14 insertions(+), 2 deletions(-) diff --git a/cep-repodata-state.md b/cep-repodata-state.md index c27d634e..b61044ac 100644 --- a/cep-repodata-state.md +++ b/cep-repodata-state.md @@ -52,7 +52,7 @@ Both mamba and conda currently use the same cache folder. If both don't implemen // The header values as before "url": STRING, "etag": STRING, - "last_modified": STRING, + "mod": STRING, "cache_control": STRING, // Hash of the cached-on-disk repodata.json.
In Python: hashlib.blake2b(digest_size=32) @@ -78,7 +78,19 @@ Both mamba and conda currently use the same cache folder. If both don't implemen // same format as `has_zst` }, - "jlap": { } // unspecified additional state for jlap when available + "jlap": { + // Intermediate checksum leading the second-to-last line of repodata.jlap + "iv": "9448c699e681ee71b4bd524f73a0b690c387df7e9f0ea2bb7ffa24af1c8c27ca", + // Offset of the start of the second-to-last line of repodata.jlap in bytes + "pos": 4226360, + // Last json line of repodata.jlap, before the trailing checksum + "footer": { + "url": "repodata.json", + "latest": "3384620e0f2bf70a418a56db98785dda530b503990e32b68afb11f27e0324d7 + 5" + } + // Other keys may appear e.g. for debugging + } } ```
Title .state.json files for repodata metadata
Title .info.json files for repodata metadata
Status Draft
Author(s) Wolf Vollprecht <wolf@prefix.dev>
Daniel Holth <dholth@anaconda.com>
Created Jan 09, 2023
Updated Jan 09, 2023
Discussion https://conda.slack.com/archives/C017F7C0VM3/p1672669131100819