From 5c68621ac73f4cbfe506d55e41493d934113336f Mon Sep 17 00:00:00 2001 From: Wh1isper <9573586@qq.com> Date: Thu, 23 Nov 2023 10:05:34 +0800 Subject: [PATCH 01/16] Making MD5 optional for ContentManager --- .../services/contents/filemanager.py | 3 +++ jupyter_server/services/contents/handlers.py | 19 ++++++++++--------- jupyter_server/services/contents/manager.py | 19 ++++++++++++++++--- 3 files changed, 29 insertions(+), 12 deletions(-) diff --git a/jupyter_server/services/contents/filemanager.py b/jupyter_server/services/contents/filemanager.py index fe027a5c49..94d264e243 100644 --- a/jupyter_server/services/contents/filemanager.py +++ b/jupyter_server/services/contents/filemanager.py @@ -48,6 +48,7 @@ class FileContentsManager(FileManagerMixin, ContentsManager): root_dir = Unicode(config=True) max_copy_folder_size_mb = Int(500, config=True, help="The max folder size that can be copied") + support_md5 = Bool(True, config=False, help="Support md5 argument in `get`") @default("root_dir") def _default_root_dir(self): @@ -725,6 +726,8 @@ def _human_readable_size(self, size): class AsyncFileContentsManager(FileContentsManager, AsyncFileManagerMixin, AsyncContentsManager): """An async file contents manager.""" + support_md5 = Bool(True, config=False, help="Support md5 argument in `get`") + @default("checkpoints_class") def _checkpoints_class_default(self): return AsyncFileCheckpoints diff --git a/jupyter_server/services/contents/handlers.py b/jupyter_server/services/contents/handlers.py index cc5ac5b8ca..00d45ca629 100644 --- a/jupyter_server/services/contents/handlers.py +++ b/jupyter_server/services/contents/handlers.py @@ -145,16 +145,17 @@ async def get(self, path=""): await self._finish_error( HTTPStatus.NOT_FOUND, f"file or directory {path!r} does not exist" ) + kwargs = { + "path": path, + "type": type, + "format": format, + "content": content, + } + if cm.support_md5: + kwargs["md5"] = md5 + try: - model = await ensure_async( - self.contents_manager.get( - path=path, - type=type, - format=format, - content=content, - md5=md5, - ) - ) + model = await ensure_async(self.contents_manager.get(**kwargs)) validate_model(model, expect_content=content, expect_md5=md5) self._finish_model(model, location=False) except web.HTTPError as exc: diff --git a/jupyter_server/services/contents/manager.py b/jupyter_server/services/contents/manager.py index 94684bb022..5bf1a131bb 100644 --- a/jupyter_server/services/contents/manager.py +++ b/jupyter_server/services/contents/manager.py @@ -111,6 +111,7 @@ def _validate_preferred_dir(self, proposal): return value allow_hidden = Bool(False, config=True, help="Allow access to hidden files") + support_md5 = Bool(False, config=False, help="Support md5 argument in `get`") notary = Instance(sign.NotebookNotary) @@ -447,8 +448,14 @@ def exists(self, path): """ return self.file_exists(path) or self.dir_exists(path) - def get(self, path, content=True, type=None, format=None, md5=False): - """Get a file or directory model.""" + def get(self, path, content=True, type=None, format=None): + """ + Get a file or directory model. + + If a ContentManager supports calculating the md5 value of a file, + `ContentManager.support_md5` should be True and this function will accept an `md5` parameter, + will return a dict with an `md5` key. + """ raise NotImplementedError def save(self, model, path): @@ -850,7 +857,13 @@ async def exists(self, path): ) async def get(self, path, content=True, type=None, format=None): - """Get a file or directory model.""" + """ + Get a file or directory model. + + If a ContentManager supports calculating the md5 value of a file, + ContentManager.support_md5 should be True and this function will accept an md5 parameter, + will return a dict with an 'md5' key. + """ raise NotImplementedError async def save(self, model, path): From 0472bd0a6440c49cb47da229c56ec164d6e06cd5 Mon Sep 17 00:00:00 2001 From: Wh1isper <9573586@qq.com> Date: Thu, 23 Nov 2023 11:31:37 +0800 Subject: [PATCH 02/16] Change md5 to sha256 --- docs/source/developers/contents.rst | 10 ++-- jupyter_server/services/api/api.yaml | 12 ++--- jupyter_server/services/contents/fileio.py | 16 +++--- .../services/contents/filemanager.py | 54 +++++++++---------- jupyter_server/services/contents/handlers.py | 34 ++++++------ jupyter_server/services/contents/manager.py | 14 ++--- tests/services/contents/test_api.py | 12 ++--- tests/services/contents/test_fileio.py | 14 +++-- tests/services/contents/test_manager.py | 8 +-- 9 files changed, 90 insertions(+), 84 deletions(-) diff --git a/docs/source/developers/contents.rst b/docs/source/developers/contents.rst index ca88025c88..79551eff42 100644 --- a/docs/source/developers/contents.rst +++ b/docs/source/developers/contents.rst @@ -63,7 +63,7 @@ Models may contain the following entries: | |``None`` |if any. (:ref:`See | | | |Below`) | +--------------------+-----------+------------------------------+ -|**md5** |unicode or |The md5 of the contents. | +|**sha256** |unicode or |The sha256 of the contents. | | |``None`` | | | | | | +--------------------+-----------+------------------------------+ @@ -80,7 +80,7 @@ model. There are three model types: **notebook**, **file**, and **directory**. :class:`nbformat.notebooknode.NotebookNode` representing the .ipynb file represented by the model. See the `NBFormat`_ documentation for a full description. - - The ``md5`` field a hexdigest string of the md5 value of the notebook + - The ``sha256`` field a hexdigest string of the sha256 value of the notebook file. - ``file`` models @@ -91,14 +91,14 @@ model. There are three model types: **notebook**, **file**, and **directory**. file models, ``content`` simply contains the file's bytes after decoding as UTF-8. Non-text (``base64``) files are read as bytes, base64 encoded, and then decoded as UTF-8. - - The ``md5`` field a hexdigest string of the md5 value of the file. + - The ``sha256`` field a hexdigest string of the sha256 value of the file. - ``directory`` models - The ``format`` field is always ``"json"``. - The ``mimetype`` field is always ``None``. - The ``content`` field contains a list of :ref:`content-free` models representing the entities in the directory. - - The ``md5`` field is always ``None``. + - The ``sha256`` field is always ``None``. .. note:: @@ -137,7 +137,7 @@ model. There are three model types: **notebook**, **file**, and **directory**. "path": "foo/a.ipynb", "type": "notebook", "writable": True, - "md5": "7e47382b370c05a1b14706a2a8aff91a", + "sha256": "f5e43a0b1c2e7836ab3b4d6b1c35c19e2558688de15a6a14e137a59e4715d34b", } # Notebook Model without Content diff --git a/jupyter_server/services/api/api.yaml b/jupyter_server/services/api/api.yaml index 0394093ba9..35010d02a6 100644 --- a/jupyter_server/services/api/api.yaml +++ b/jupyter_server/services/api/api.yaml @@ -106,9 +106,9 @@ paths: in: query description: "Return content (0 for no content, 1 for return content)" type: integer - - name: md5 + - name: sha256 in: query - description: "Return md5 hexdigest string of content (0 for no md5, 1 for return md5)" + description: "Return sha256 hexdigest string of content (0 for no sha256, 1 for return sha256)" type: integer responses: 404: @@ -889,7 +889,7 @@ definitions: kernel: $ref: "#/definitions/Kernel" Contents: - description: "A contents object. The content and format keys may be null if content is not contained. The md5 maybe null if md5 is not contained. If type is 'file', then the mimetype will be null." + description: "A contents object. The content and format keys may be null if content is not contained. The sha256 maybe null if sha256 is not contained. If type is 'file', then the mimetype will be null." type: object required: - type @@ -901,7 +901,7 @@ definitions: - mimetype - format - content - - md5 + - sha256 properties: name: type: string @@ -939,9 +939,9 @@ definitions: format: type: string description: Format of content (one of null, 'text', 'base64', 'json') - md5: + sha256: type: string - description: "The md5 hexdigest string of content, if requested (otherwise null)." + description: "The sha256 hexdigest string of content, if requested (otherwise null)." Checkpoints: description: A checkpoint object. type: object diff --git a/jupyter_server/services/contents/fileio.py b/jupyter_server/services/contents/fileio.py index 5f0aa4a8bf..e8a7f9ff9f 100644 --- a/jupyter_server/services/contents/fileio.py +++ b/jupyter_server/services/contents/fileio.py @@ -357,11 +357,11 @@ def _save_file(self, os_path, content, format): with self.atomic_writing(os_path, text=False) as f: f.write(bcontent) - def _get_md5(self, os_path): + def _get_sha256(self, os_path): c, _ = self._read_file(os_path, "byte") - md5 = hashlib.md5() - md5.update(c) - return md5.hexdigest() + sha256 = hashlib.sha256() + sha256.update(c) + return sha256.hexdigest() class AsyncFileManagerMixin(FileManagerMixin): @@ -475,8 +475,8 @@ async def _save_file(self, os_path, content, format): with self.atomic_writing(os_path, text=False) as f: await run_sync(f.write, bcontent) - async def _get_md5(self, os_path): + async def _get_sha256(self, os_path): c, _ = await self._read_file(os_path, "byte") - md5 = hashlib.md5() - await run_sync(md5.update, c) - return md5.hexdigest() + sha256 = hashlib.sha256() + await run_sync(sha256.update, c) + return sha256.hexdigest() diff --git a/jupyter_server/services/contents/filemanager.py b/jupyter_server/services/contents/filemanager.py index 94d264e243..832413626c 100644 --- a/jupyter_server/services/contents/filemanager.py +++ b/jupyter_server/services/contents/filemanager.py @@ -48,7 +48,7 @@ class FileContentsManager(FileManagerMixin, ContentsManager): root_dir = Unicode(config=True) max_copy_folder_size_mb = Int(500, config=True, help="The max folder size that can be copied") - support_md5 = Bool(True, config=False, help="Support md5 argument in `get`") + support_sha256 = Bool(True, config=False, help="Support sha256 argument in `get`") @default("root_dir") def _default_root_dir(self): @@ -269,7 +269,7 @@ def _base_model(self, path): model["mimetype"] = None model["size"] = size model["writable"] = self.is_writable(path) - model["md5"] = None + model["sha256"] = None return model @@ -337,7 +337,7 @@ def _dir_model(self, path, content=True): return model - def _file_model(self, path, content=True, format=None, md5=False): + def _file_model(self, path, content=True, format=None, sha256=False): """Build a model for a file if content is requested, include the file contents. @@ -366,13 +366,13 @@ def _file_model(self, path, content=True, format=None, md5=False): content=content, format=format, ) - if md5: - md5 = self._get_md5(os_path) - model.update(md5=md5) + if sha256: + sha256 = self._get_sha256(os_path) + model.update(sha256=sha256) return model - def _notebook_model(self, path, content=True, md5=False): + def _notebook_model(self, path, content=True, sha256=False): """Build a notebook model if content is requested, the notebook content will be populated @@ -391,12 +391,12 @@ def _notebook_model(self, path, content=True, md5=False): model["content"] = nb model["format"] = "json" self.validate_notebook_model(model, validation_error) - if md5: - model["md5"] = self._get_md5(os_path) + if sha256: + model["sha256"] = self._get_sha256(os_path) return model - def get(self, path, content=True, type=None, format=None, md5=None): + def get(self, path, content=True, type=None, format=None, sha256=None): """Takes a path for an entity and returns its model Parameters @@ -411,8 +411,8 @@ def get(self, path, content=True, type=None, format=None, md5=None): format : str, optional The requested format for file contents. 'text' or 'base64'. Ignored if this returns a notebook or directory model. - md5: bool, optional - Whether to include the md5 of the file contents. + sha256: bool, optional + Whether to include the sha256 of the file contents. Returns ------- @@ -440,11 +440,11 @@ def get(self, path, content=True, type=None, format=None, md5=None): ) model = self._dir_model(path, content=content) elif type == "notebook" or (type is None and path.endswith(".ipynb")): - model = self._notebook_model(path, content=content, md5=md5) + model = self._notebook_model(path, content=content, sha256=sha256) else: if type == "directory": raise web.HTTPError(400, "%s is not a directory" % path, reason="bad type") - model = self._file_model(path, content=content, format=format, md5=md5) + model = self._file_model(path, content=content, format=format, sha256=sha256) self.emit(data={"action": "get", "path": path}) return model @@ -726,7 +726,7 @@ def _human_readable_size(self, size): class AsyncFileContentsManager(FileContentsManager, AsyncFileManagerMixin, AsyncContentsManager): """An async file contents manager.""" - support_md5 = Bool(True, config=False, help="Support md5 argument in `get`") + support_sha256 = Bool(True, config=False, help="Support sha256 argument in `get`") @default("checkpoints_class") def _checkpoints_class_default(self): @@ -797,7 +797,7 @@ async def _dir_model(self, path, content=True): return model - async def _file_model(self, path, content=True, format=None, md5=False): + async def _file_model(self, path, content=True, format=None, sha256=False): """Build a model for a file if content is requested, include the file contents. @@ -826,13 +826,13 @@ async def _file_model(self, path, content=True, format=None, md5=False): content=content, format=format, ) - if md5: - md5 = await self._get_md5(os_path) - model.update(md5=md5) + if sha256: + sha256 = await self._get_sha256(os_path) + model.update(sha256=sha256) return model - async def _notebook_model(self, path, content=True, md5=False): + async def _notebook_model(self, path, content=True, sha256=False): """Build a notebook model if content is requested, the notebook content will be populated @@ -851,12 +851,12 @@ async def _notebook_model(self, path, content=True, md5=False): model["content"] = nb model["format"] = "json" self.validate_notebook_model(model, validation_error) - if md5: - model["md5"] = await self._get_md5(os_path) + if sha256: + model["sha256"] = await self._get_sha256(os_path) return model - async def get(self, path, content=True, type=None, format=None, md5=False): + async def get(self, path, content=True, type=None, format=None, sha256=False): """Takes a path for an entity and returns its model Parameters @@ -871,8 +871,8 @@ async def get(self, path, content=True, type=None, format=None, md5=False): format : str, optional The requested format for file contents. 'text' or 'base64'. Ignored if this returns a notebook or directory model. - md5: bool, optional - Whether to include the md5 of the file contents. + sha256: bool, optional + Whether to include the sha256 of the file contents. Returns ------- @@ -895,11 +895,11 @@ async def get(self, path, content=True, type=None, format=None, md5=False): ) model = await self._dir_model(path, content=content) elif type == "notebook" or (type is None and path.endswith(".ipynb")): - model = await self._notebook_model(path, content=content, md5=md5) + model = await self._notebook_model(path, content=content, sha256=sha256) else: if type == "directory": raise web.HTTPError(400, "%s is not a directory" % path, reason="bad type") - model = await self._file_model(path, content=content, format=format, md5=md5) + model = await self._file_model(path, content=content, format=format, sha256=sha256) self.emit(data={"action": "get", "path": path}) return model diff --git a/jupyter_server/services/contents/handlers.py b/jupyter_server/services/contents/handlers.py index 00d45ca629..3534e8207d 100644 --- a/jupyter_server/services/contents/handlers.py +++ b/jupyter_server/services/contents/handlers.py @@ -44,14 +44,14 @@ def _validate_in_or_not(expect_in_model: bool, model: Dict[str, Any], maybe_none ) -def validate_model(model, expect_content, expect_md5): +def validate_model(model, expect_content, expect_sha256): """ Validate a model returned by a ContentsManager method. If expect_content is True, then we expect non-null entries for 'content' and 'format'. - If expect_md5 is True, then we expect non-null entries for 'md5'. + If expect_sha256 is True, then we expect non-null entries for 'sha256'. """ required_keys = { "name", @@ -63,7 +63,7 @@ def validate_model(model, expect_content, expect_md5): "mimetype", "content", "format", - "md5", + "sha256", } missing = required_keys - set(model.keys()) if missing: @@ -73,9 +73,9 @@ def validate_model(model, expect_content, expect_md5): ) content_keys = ["content", "format"] - md5_keys = ["md5"] + sha256_keys = ["sha256"] _validate_in_or_not(expect_content, model, content_keys) - _validate_in_or_not(expect_md5, model, md5_keys) + _validate_in_or_not(expect_sha256, model, sha256_keys) class ContentsAPIHandler(APIHandler): @@ -136,10 +136,10 @@ async def get(self, path=""): raise web.HTTPError(400, "Content %r is invalid" % content_str) content = int(content_str or "") - md5_str = self.get_query_argument("md5", default="0") - if md5_str not in {"0", "1"}: - raise web.HTTPError(400, "Content %r is invalid" % md5_str) - md5 = int(md5_str or "") + sha256_str = self.get_query_argument("sha256", default="0") + if sha256_str not in {"0", "1"}: + raise web.HTTPError(400, "Content %r is invalid" % sha256_str) + sha256 = int(sha256_str or "") if not cm.allow_hidden and await ensure_async(cm.is_hidden(path)): await self._finish_error( @@ -151,12 +151,12 @@ async def get(self, path=""): "format": format, "content": content, } - if cm.support_md5: - kwargs["md5"] = md5 + if cm.support_sha256: + kwargs["sha256"] = sha256 try: model = await ensure_async(self.contents_manager.get(**kwargs)) - validate_model(model, expect_content=content, expect_md5=md5) + validate_model(model, expect_content=content, expect_sha256=sha256) self._finish_model(model, location=False) except web.HTTPError as exc: # 404 is okay in this context, catch exception and return 404 code to prevent stack trace on client @@ -186,7 +186,7 @@ async def patch(self, path=""): raise web.HTTPError(400, f"Cannot rename file or directory {path!r}") model = await ensure_async(cm.update(model, path)) - validate_model(model, expect_content=False, expect_md5=False) + validate_model(model, expect_content=False, expect_sha256=False) self._finish_model(model) async def _copy(self, copy_from, copy_to=None): @@ -199,7 +199,7 @@ async def _copy(self, copy_from, copy_to=None): ) model = await ensure_async(self.contents_manager.copy(copy_from, copy_to)) self.set_status(201) - validate_model(model, expect_content=False, expect_md5=False) + validate_model(model, expect_content=False, expect_sha256=False) self._finish_model(model) async def _upload(self, model, path): @@ -207,7 +207,7 @@ async def _upload(self, model, path): self.log.info("Uploading file to %s", path) model = await ensure_async(self.contents_manager.new(model, path)) self.set_status(201) - validate_model(model, expect_content=False, expect_md5=False) + validate_model(model, expect_content=False, expect_sha256=False) self._finish_model(model) async def _new_untitled(self, path, type="", ext=""): @@ -217,7 +217,7 @@ async def _new_untitled(self, path, type="", ext=""): self.contents_manager.new_untitled(path=path, type=type, ext=ext) ) self.set_status(201) - validate_model(model, expect_content=False, expect_md5=False) + validate_model(model, expect_content=False, expect_sha256=False) self._finish_model(model) async def _save(self, model, path): @@ -226,7 +226,7 @@ async def _save(self, model, path): if not chunk or chunk == -1: # Avoid tedious log information self.log.info("Saving file at %s", path) model = await ensure_async(self.contents_manager.save(model, path)) - validate_model(model, expect_content=False, expect_md5=False) + validate_model(model, expect_content=False, expect_sha256=False) self._finish_model(model) @web.authenticated diff --git a/jupyter_server/services/contents/manager.py b/jupyter_server/services/contents/manager.py index 5bf1a131bb..382154b7f3 100644 --- a/jupyter_server/services/contents/manager.py +++ b/jupyter_server/services/contents/manager.py @@ -111,7 +111,7 @@ def _validate_preferred_dir(self, proposal): return value allow_hidden = Bool(False, config=True, help="Allow access to hidden files") - support_md5 = Bool(False, config=False, help="Support md5 argument in `get`") + support_sha256 = Bool(False, config=False, help="Support sha256 argument in `get`") notary = Instance(sign.NotebookNotary) @@ -452,9 +452,9 @@ def get(self, path, content=True, type=None, format=None): """ Get a file or directory model. - If a ContentManager supports calculating the md5 value of a file, - `ContentManager.support_md5` should be True and this function will accept an `md5` parameter, - will return a dict with an `md5` key. + If a ContentManager supports calculating the sha256 value of a file, + `ContentManager.support_sha256` should be True and this function will accept an `sha256` parameter, + will return a dict with an `sha256` key. """ raise NotImplementedError @@ -860,9 +860,9 @@ async def get(self, path, content=True, type=None, format=None): """ Get a file or directory model. - If a ContentManager supports calculating the md5 value of a file, - ContentManager.support_md5 should be True and this function will accept an md5 parameter, - will return a dict with an 'md5' key. + If a ContentManager supports calculating the sha256 value of a file, + ContentManager.support_sha256 should be True and this function will accept an sha256 parameter, + will return a dict with an 'sha256' key. """ raise NotImplementedError diff --git a/tests/services/contents/test_api.py b/tests/services/contents/test_api.py index eb93fd7526..4d75d4ce66 100644 --- a/tests/services/contents/test_api.py +++ b/tests/services/contents/test_api.py @@ -103,15 +103,15 @@ async def test_get_nb_contents(jp_fetch, contents, path, name): @pytest.mark.parametrize("path,name", dirs) -async def test_get_nb_md5(jp_fetch, contents, path, name): +async def test_get_nb_sha256(jp_fetch, contents, path, name): nbname = name + ".ipynb" nbpath = (path + "/" + nbname).lstrip("/") - r = await jp_fetch("api", "contents", nbpath, method="GET", params=dict(md5="1")) + r = await jp_fetch("api", "contents", nbpath, method="GET", params=dict(sha256="1")) model = json.loads(r.body.decode()) assert model["name"] == nbname assert model["path"] == nbpath assert model["type"] == "notebook" - assert "md5" in model + assert "sha256" in model assert "metadata" in model["content"] assert isinstance(model["content"]["metadata"], dict) @@ -201,14 +201,14 @@ async def test_get_text_file_contents(jp_fetch, contents, path, name): @pytest.mark.parametrize("path,name", dirs) -async def test_get_text_file_md5(jp_fetch, contents, path, name): +async def test_get_text_file_sha256(jp_fetch, contents, path, name): txtname = name + ".txt" txtpath = (path + "/" + txtname).lstrip("/") - r = await jp_fetch("api", "contents", txtpath, method="GET", params=dict(md5="1")) + r = await jp_fetch("api", "contents", txtpath, method="GET", params=dict(sha256="1")) model = json.loads(r.body.decode()) assert model["name"] == txtname assert model["path"] == txtpath - assert "md5" in model + assert "sha256" in model assert model["format"] == "text" assert model["type"] == "file" diff --git a/tests/services/contents/test_fileio.py b/tests/services/contents/test_fileio.py index 19060db94a..6ec46f5d00 100644 --- a/tests/services/contents/test_fileio.py +++ b/tests/services/contents/test_fileio.py @@ -142,8 +142,11 @@ def test_file_manager_mixin(tmpdir): mixin.log = logging.getLogger() bad_content = tmpdir / "bad_content.ipynb" bad_content.write_text("{}", "utf8") - # Same as `echo -n {} | md5sum` - assert mixin._get_md5(bad_content) == "99914b932bd37a50b983c5e7c90ae93b" + # Same as `echo -n {} | sha256sum` + assert ( + mixin._get_sha256(bad_content) + == "44136fa355b3678a1146ad16f7e8649e94fb4fc21fe77e8310c060f61caaff8a" + ) with pytest.raises(HTTPError): mixin._read_notebook(bad_content) other = path_to_intermediate(bad_content) @@ -166,8 +169,11 @@ async def test_async_file_manager_mixin(tmpdir): mixin.log = logging.getLogger() bad_content = tmpdir / "bad_content.ipynb" bad_content.write_text("{}", "utf8") - # Same as `echo -n {} | md5sum` - assert await mixin._get_md5(bad_content) == "99914b932bd37a50b983c5e7c90ae93b" + # Same as `echo -n {} | sha256sum` + assert ( + await mixin._get_sha256(bad_content) + == "44136fa355b3678a1146ad16f7e8649e94fb4fc21fe77e8310c060f61caaff8a" + ) with pytest.raises(HTTPError): await mixin._read_notebook(bad_content) other = path_to_intermediate(bad_content) diff --git a/tests/services/contents/test_manager.py b/tests/services/contents/test_manager.py index 3d11a43ad0..62377041e8 100644 --- a/tests/services/contents/test_manager.py +++ b/tests/services/contents/test_manager.py @@ -571,8 +571,8 @@ async def test_get(jp_contents_manager): nb_as_bin_file = await ensure_async(cm.get(path, content=True, type="file", format="base64")) assert nb_as_bin_file["format"] == "base64" - nb_with_md5 = await ensure_async(cm.get(path, md5=True)) - assert nb_with_md5["md5"] + nb_with_sha256 = await ensure_async(cm.get(path, sha256=True)) + assert nb_with_sha256["sha256"] # Test in sub-directory sub_dir = "/foo/" @@ -588,7 +588,7 @@ async def test_get(jp_contents_manager): # Test with a regular file. file_model_path = (await ensure_async(cm.new_untitled(path=sub_dir, ext=".txt")))["path"] - file_model = await ensure_async(cm.get(file_model_path, md5=True)) + file_model = await ensure_async(cm.get(file_model_path, sha256=True)) expected_model = { "content": "", "format": "text", @@ -603,7 +603,7 @@ async def test_get(jp_contents_manager): assert file_model[key] == value assert "created" in file_model assert "last_modified" in file_model - assert "md5" in file_model + assert "sha256" in file_model # Create a sub-sub directory to test getting directory contents with a # subdir. From a1bd1a4829cf609a83341e8798eb80bde87eb659 Mon Sep 17 00:00:00 2001 From: Wh1isper <9573586@qq.com> Date: Thu, 23 Nov 2023 12:21:27 +0800 Subject: [PATCH 03/16] Add downstream tests for jupytext --- .github/workflows/downstream.yml | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) diff --git a/.github/workflows/downstream.yml b/.github/workflows/downstream.yml index 635c404e32..0e1e6ff0b0 100644 --- a/.github/workflows/downstream.yml +++ b/.github/workflows/downstream.yml @@ -107,6 +107,23 @@ jobs: test_command: pip install pytest-jupyter[server] && pytest -vv -raXxs -W default --durations 10 --color=yes package_name: jupyter_server_terminals + jupytext: + runs-on: ubuntu-latest + timeout-minutes: 10 + + steps: + - name: Checkout + uses: actions/checkout@v4 + + - name: Base Setup + uses: jupyterlab/maintainer-tools/.github/actions/base-setup@v1 + + - name: Test jupytext + uses: jupyterlab/maintainer-tools/.github/actions/downstream-test@v1 + with: + package_name: jupytext + test_command: pip install jupytextr[test] && python -m ipykernel install --name jupytext-dev --user && pytest -vv -raXxs -W default --durations 10 --color=yes + downstream_check: # This job does nothing and is only used for the branch protection if: always() needs: @@ -115,6 +132,7 @@ jobs: - jupyterlab_server - notebook - nbclassic + - jupytext runs-on: ubuntu-latest steps: - name: Decide whether the needed jobs succeeded or failed From f9232ab16e2906af01504d98b1f908222929bcae Mon Sep 17 00:00:00 2001 From: Wh1isper <9573586@qq.com> Date: Thu, 23 Nov 2023 12:23:47 +0800 Subject: [PATCH 04/16] Fix ci --- .github/workflows/downstream.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/downstream.yml b/.github/workflows/downstream.yml index 0e1e6ff0b0..cb14b3754b 100644 --- a/.github/workflows/downstream.yml +++ b/.github/workflows/downstream.yml @@ -122,7 +122,7 @@ jobs: uses: jupyterlab/maintainer-tools/.github/actions/downstream-test@v1 with: package_name: jupytext - test_command: pip install jupytextr[test] && python -m ipykernel install --name jupytext-dev --user && pytest -vv -raXxs -W default --durations 10 --color=yes + test_command: pip install jupytext[test] && python -m ipykernel install --name jupytext-dev --user && pytest -vv -raXxs -W default --durations 10 --color=yes downstream_check: # This job does nothing and is only used for the branch protection if: always() From bf9803857ac666e8ddc82b02452ab5ef81045564 Mon Sep 17 00:00:00 2001 From: Wh1isper <9573586@qq.com> Date: Thu, 23 Nov 2023 12:45:57 +0800 Subject: [PATCH 05/16] Fix ci --- .github/workflows/downstream.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/downstream.yml b/.github/workflows/downstream.yml index cb14b3754b..0916124622 100644 --- a/.github/workflows/downstream.yml +++ b/.github/workflows/downstream.yml @@ -122,7 +122,7 @@ jobs: uses: jupyterlab/maintainer-tools/.github/actions/downstream-test@v1 with: package_name: jupytext - test_command: pip install jupytext[test] && python -m ipykernel install --name jupytext-dev --user && pytest -vv -raXxs -W default --durations 10 --color=yes + test_command: pip install pytest-jupyter[server] && python -m ipykernel install --name jupytext-dev --user && pytest -vv -raXxs -W default --durations 10 --color=yes downstream_check: # This job does nothing and is only used for the branch protection if: always() From e07f573a7e384b8821140466b2d03eab3a852909 Mon Sep 17 00:00:00 2001 From: Wh1isper <9573586@qq.com> Date: Thu, 23 Nov 2023 12:54:07 +0800 Subject: [PATCH 06/16] Fix ci --- .github/workflows/downstream.yml | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/.github/workflows/downstream.yml b/.github/workflows/downstream.yml index 0916124622..4284b1ce2f 100644 --- a/.github/workflows/downstream.yml +++ b/.github/workflows/downstream.yml @@ -118,11 +118,15 @@ jobs: - name: Base Setup uses: jupyterlab/maintainer-tools/.github/actions/base-setup@v1 + - name: Install some missing tests dependencies + run: | + pip install gitpython + - name: Test jupytext uses: jupyterlab/maintainer-tools/.github/actions/downstream-test@v1 with: package_name: jupytext - test_command: pip install pytest-jupyter[server] && python -m ipykernel install --name jupytext-dev --user && pytest -vv -raXxs -W default --durations 10 --color=yes + test_command: pip install pytest-jupyter[server] && python -m ipykernel install --name jupytext-dev --user && pytest -vv -raXxs -W default --durations 10 --color=yes --ignore=tests/test_doc_files_are_notebooks.py downstream_check: # This job does nothing and is only used for the branch protection if: always() From f948397f50715a01e1650643ce31c424b364179a Mon Sep 17 00:00:00 2001 From: Wh1isper <9573586@qq.com> Date: Thu, 23 Nov 2023 13:00:02 +0800 Subject: [PATCH 07/16] Fix CI --- .github/workflows/downstream.yml | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) diff --git a/.github/workflows/downstream.yml b/.github/workflows/downstream.yml index 4284b1ce2f..843bb44e34 100644 --- a/.github/workflows/downstream.yml +++ b/.github/workflows/downstream.yml @@ -118,15 +118,11 @@ jobs: - name: Base Setup uses: jupyterlab/maintainer-tools/.github/actions/base-setup@v1 - - name: Install some missing tests dependencies - run: | - pip install gitpython - - name: Test jupytext uses: jupyterlab/maintainer-tools/.github/actions/downstream-test@v1 with: package_name: jupytext - test_command: pip install pytest-jupyter[server] && python -m ipykernel install --name jupytext-dev --user && pytest -vv -raXxs -W default --durations 10 --color=yes --ignore=tests/test_doc_files_are_notebooks.py + test_command: pip install pytest-jupyter[server] gitpython && python -m ipykernel install --name jupytext-dev --user && pytest -vv -raXxs -W default --durations 10 --color=yes --ignore=tests/test_doc_files_are_notebooks.py --ignore=tests/test_changelog.py downstream_check: # This job does nothing and is only used for the branch protection if: always() From ed6f1f90e2e8028a0858b1bc90af6c71dc349fce Mon Sep 17 00:00:00 2001 From: Wh1isper <9573586@qq.com> Date: Thu, 23 Nov 2023 13:02:38 +0800 Subject: [PATCH 08/16] Update downstream test command --- .github/workflows/downstream.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/downstream.yml b/.github/workflows/downstream.yml index 843bb44e34..8763635490 100644 --- a/.github/workflows/downstream.yml +++ b/.github/workflows/downstream.yml @@ -122,7 +122,7 @@ jobs: uses: jupyterlab/maintainer-tools/.github/actions/downstream-test@v1 with: package_name: jupytext - test_command: pip install pytest-jupyter[server] gitpython && python -m ipykernel install --name jupytext-dev --user && pytest -vv -raXxs -W default --durations 10 --color=yes --ignore=tests/test_doc_files_are_notebooks.py --ignore=tests/test_changelog.py + test_command: pip install pytest-jupyter[server] gitpython pre-commit && python -m ipykernel install --name jupytext-dev --user && pytest -vv -raXxs -W default --durations 10 --color=yes --ignore=tests/test_doc_files_are_notebooks.py --ignore=tests/test_changelog.py downstream_check: # This job does nothing and is only used for the branch protection if: always() From f02f1e84b94a51af162dab9e0c071ec356be9488 Mon Sep 17 00:00:00 2001 From: Wh1isper <9573586@qq.com> Date: Thu, 23 Nov 2023 15:28:39 +0800 Subject: [PATCH 09/16] Switch to hash and hash_algorithm --- docs/source/developers/contents.rst | 20 +++-- jupyter_server/services/api/api.yaml | 16 ++-- .../services/contents/filecheckpoints.py | 4 +- jupyter_server/services/contents/fileio.py | 66 +++++++++++----- .../services/contents/filemanager.py | 79 ++++++++++--------- jupyter_server/services/contents/handlers.py | 35 ++++---- jupyter_server/services/contents/manager.py | 16 ++-- tests/services/contents/test_api.py | 28 +++++-- tests/services/contents/test_fileio.py | 4 +- tests/services/contents/test_manager.py | 8 +- 10 files changed, 170 insertions(+), 106 deletions(-) diff --git a/docs/source/developers/contents.rst b/docs/source/developers/contents.rst index 79551eff42..48c329fa2a 100644 --- a/docs/source/developers/contents.rst +++ b/docs/source/developers/contents.rst @@ -63,10 +63,14 @@ Models may contain the following entries: | |``None`` |if any. (:ref:`See | | | |Below`) | +--------------------+-----------+------------------------------+ -|**sha256** |unicode or |The sha256 of the contents. | +|**hash** |unicode or |The hash of the contents. | | |``None`` | | | | | | +--------------------+-----------+------------------------------+ +|**hash_algorithm** |unicode |The algorithm used to compute | +| | |hash value. | +| | | | ++--------------------+-----------+------------------------------+ .. _modelcontent: @@ -80,8 +84,9 @@ model. There are three model types: **notebook**, **file**, and **directory**. :class:`nbformat.notebooknode.NotebookNode` representing the .ipynb file represented by the model. See the `NBFormat`_ documentation for a full description. - - The ``sha256`` field a hexdigest string of the sha256 value of the notebook - file. + - The ``hash`` field a hexdigest string of the hash value of the file. + If ``ContentManager.get`` not support hash, it should always be ``None``. + - ``hash_algorithm`` is the algorithm used to compute the hash value. - ``file`` models - The ``format`` field is either ``"text"`` or ``"base64"``. @@ -91,14 +96,16 @@ model. There are three model types: **notebook**, **file**, and **directory**. file models, ``content`` simply contains the file's bytes after decoding as UTF-8. Non-text (``base64``) files are read as bytes, base64 encoded, and then decoded as UTF-8. - - The ``sha256`` field a hexdigest string of the sha256 value of the file. + - The ``hash`` field a hexdigest string of the hash value of the file. + If ``ContentManager.get`` not support hash, it should always be ``None``. + - ``hash_algorithm`` is the algorithm used to compute the hash value. - ``directory`` models - The ``format`` field is always ``"json"``. - The ``mimetype`` field is always ``None``. - The ``content`` field contains a list of :ref:`content-free` models representing the entities in the directory. - - The ``sha256`` field is always ``None``. + - The ``hash`` field is always ``None``. .. note:: @@ -137,7 +144,8 @@ model. There are three model types: **notebook**, **file**, and **directory**. "path": "foo/a.ipynb", "type": "notebook", "writable": True, - "sha256": "f5e43a0b1c2e7836ab3b4d6b1c35c19e2558688de15a6a14e137a59e4715d34b", + "hash": "f5e43a0b1c2e7836ab3b4d6b1c35c19e2558688de15a6a14e137a59e4715d34b", + "hash_algorithm": "sha256", } # Notebook Model without Content diff --git a/jupyter_server/services/api/api.yaml b/jupyter_server/services/api/api.yaml index 35010d02a6..3b77a60314 100644 --- a/jupyter_server/services/api/api.yaml +++ b/jupyter_server/services/api/api.yaml @@ -106,9 +106,9 @@ paths: in: query description: "Return content (0 for no content, 1 for return content)" type: integer - - name: sha256 + - name: hash in: query - description: "Return sha256 hexdigest string of content (0 for no sha256, 1 for return sha256)" + description: "Return hash hexdigest string of content (0 for no hash, 1 for return hash), when ContentsManager not support hash, it will be ignored." type: integer responses: 404: @@ -889,7 +889,7 @@ definitions: kernel: $ref: "#/definitions/Kernel" Contents: - description: "A contents object. The content and format keys may be null if content is not contained. The sha256 maybe null if sha256 is not contained. If type is 'file', then the mimetype will be null." + description: "A contents object. The content and format keys may be null if content is not contained. The hash maybe null if hash is not required. If type is 'file', then the mimetype will be null." type: object required: - type @@ -901,7 +901,8 @@ definitions: - mimetype - format - content - - sha256 + - hash + - hash_algorithm properties: name: type: string @@ -939,9 +940,12 @@ definitions: format: type: string description: Format of content (one of null, 'text', 'base64', 'json') - sha256: + hash: type: string - description: "The sha256 hexdigest string of content, if requested (otherwise null)." + description: "The hexdigest hash string of content, if requested (otherwise null)." + hash_algorithm: + type: string + ddescription: "The algorithm used to produce the hash." Checkpoints: description: A checkpoint object. type: object diff --git a/jupyter_server/services/contents/filecheckpoints.py b/jupyter_server/services/contents/filecheckpoints.py index f6d1ef44e7..d7c7e77103 100644 --- a/jupyter_server/services/contents/filecheckpoints.py +++ b/jupyter_server/services/contents/filecheckpoints.py @@ -252,7 +252,7 @@ def get_file_checkpoint(self, checkpoint_id, path): if not os.path.isfile(os_checkpoint_path): self.no_such_checkpoint(path, checkpoint_id) - content, format = self._read_file(os_checkpoint_path, format=None) + content, format, _ = self._read_file(os_checkpoint_path, format=None) return { "type": "file", "content": content, @@ -318,7 +318,7 @@ async def get_file_checkpoint(self, checkpoint_id, path): if not os.path.isfile(os_checkpoint_path): self.no_such_checkpoint(path, checkpoint_id) - content, format = await self._read_file(os_checkpoint_path, format=None) + content, format, _ = await self._read_file(os_checkpoint_path, format=None) return { "type": "file", "content": content, diff --git a/jupyter_server/services/contents/fileio.py b/jupyter_server/services/contents/fileio.py index e8a7f9ff9f..99debe8c7a 100644 --- a/jupyter_server/services/contents/fileio.py +++ b/jupyter_server/services/contents/fileio.py @@ -14,7 +14,7 @@ import nbformat from anyio.to_thread import run_sync from tornado.web import HTTPError -from traitlets import Bool +from traitlets import Bool, Unicode from traitlets.config import Configurable from traitlets.config.configurable import LoggingConfigurable @@ -193,6 +193,12 @@ class FileManagerMixin(LoggingConfigurable, Configurable): If set to False, the new notebook is written directly on the old one which could fail (eg: full filesystem or quota )""", ) + hash_algorithm = Unicode( + "sha256", + config=True, + help="hash algorithm to use for file content, support by hashlib", + ) + @contextmanager def open(self, os_path, *args, **kwargs): """wrapper around io.open that turns permission errors into 403""" @@ -305,7 +311,16 @@ def _save_notebook(self, os_path, nb, capture_validation_error=None): capture_validation_error=capture_validation_error, ) - def _read_file(self, os_path, format): + def _get_hash_from_file(self, os_path): + _, _, h = self._read_file(os_path, "byte", require_hash=True) + return h + + def _get_hash_from_content(self, bcontent: bytes): + h = hashlib.new(self.hash_algorithm) + h.update(bcontent) + return h.hexdigest() + + def _read_file(self, os_path, format, require_hash=False): """Read a non-notebook file. os_path: The path to be read. @@ -314,21 +329,27 @@ def _read_file(self, os_path, format): If 'base64', the raw bytes contents will be encoded as base64. If 'byte', the raw bytes contents will be returned. If not specified, try to decode as UTF-8, and fall back to base64 + hash: + If require_hash is 'True', return the hash of the file contents as a hex string. """ if not os.path.isfile(os_path): raise HTTPError(400, "Cannot read non-file %s" % os_path) with self.open(os_path, "rb") as f: bcontent = f.read() + + # Calculate hash value if required + hash_value = self._get_hash_from_content(bcontent) if require_hash else None + if format == "byte": # Not for http response but internal use - return bcontent, "byte" + return bcontent, "byte", hash_value if format is None or format == "text": # Try to interpret as unicode if format is unknown or if unicode # was explicitly requested. try: - return bcontent.decode("utf8"), "text" + return bcontent.decode("utf8"), "text", hash_value except UnicodeError as e: if format == "text": raise HTTPError( @@ -336,7 +357,7 @@ def _read_file(self, os_path, format): "%s is not UTF-8 encoded" % os_path, reason="bad format", ) from e - return encodebytes(bcontent).decode("ascii"), "base64" + return encodebytes(bcontent).decode("ascii"), "base64", hash_value def _save_file(self, os_path, content, format): """Save content of a generic file.""" @@ -357,12 +378,6 @@ def _save_file(self, os_path, content, format): with self.atomic_writing(os_path, text=False) as f: f.write(bcontent) - def _get_sha256(self, os_path): - c, _ = self._read_file(os_path, "byte") - sha256 = hashlib.sha256() - sha256.update(c) - return sha256.hexdigest() - class AsyncFileManagerMixin(FileManagerMixin): """ @@ -423,7 +438,7 @@ async def _save_notebook(self, os_path, nb, capture_validation_error=None): f, ) - async def _read_file(self, os_path, format): + async def _read_file(self, os_path, format, require_hash=False): """Read a non-notebook file. os_path: The path to be read. @@ -432,21 +447,29 @@ async def _read_file(self, os_path, format): If 'base64', the raw bytes contents will be encoded as base64. If 'byte', the raw bytes contents will be returned. If not specified, try to decode as UTF-8, and fall back to base64 + hash: + If require_hash is 'True', return the hash of the file contents as a hex string. """ if not os.path.isfile(os_path): raise HTTPError(400, "Cannot read non-file %s" % os_path) with self.open(os_path, "rb") as f: bcontent = await run_sync(f.read) + + if require_hash: + hash_value = await self._get_hash_from_content(bcontent) + else: + hash_value = None + if format == "byte": # Not for http response but internal use - return bcontent, "byte" + return bcontent, "byte", hash_value if format is None or format == "text": # Try to interpret as unicode if format is unknown or if unicode # was explicitly requested. try: - return bcontent.decode("utf8"), "text" + return bcontent.decode("utf8"), "text", hash_value except UnicodeError as e: if format == "text": raise HTTPError( @@ -454,7 +477,7 @@ async def _read_file(self, os_path, format): "%s is not UTF-8 encoded" % os_path, reason="bad format", ) from e - return encodebytes(bcontent).decode("ascii"), "base64" + return encodebytes(bcontent).decode("ascii"), "base64", hash_value async def _save_file(self, os_path, content, format): """Save content of a generic file.""" @@ -475,8 +498,11 @@ async def _save_file(self, os_path, content, format): with self.atomic_writing(os_path, text=False) as f: await run_sync(f.write, bcontent) - async def _get_sha256(self, os_path): - c, _ = await self._read_file(os_path, "byte") - sha256 = hashlib.sha256() - await run_sync(sha256.update, c) - return sha256.hexdigest() + async def _get_hash_from_content(self, bcontent: bytes): + h = hashlib.new(self.hash_algorithm) + await run_sync(h.update, bcontent) + return h.hexdigest() + + async def _get_hash_from_file(self, os_path): + _, _, h = await self._read_file(os_path, "byte", require_hash=True) + return h diff --git a/jupyter_server/services/contents/filemanager.py b/jupyter_server/services/contents/filemanager.py index 832413626c..a0300a87e0 100644 --- a/jupyter_server/services/contents/filemanager.py +++ b/jupyter_server/services/contents/filemanager.py @@ -48,7 +48,7 @@ class FileContentsManager(FileManagerMixin, ContentsManager): root_dir = Unicode(config=True) max_copy_folder_size_mb = Int(500, config=True, help="The max folder size that can be copied") - support_sha256 = Bool(True, config=False, help="Support sha256 argument in `get`") + support_hash = Bool(True, config=False, help="Support require_hash argument in `get`") @default("root_dir") def _default_root_dir(self): @@ -269,7 +269,8 @@ def _base_model(self, path): model["mimetype"] = None model["size"] = size model["writable"] = self.is_writable(path) - model["sha256"] = None + model["hash"] = None + model["hash_algorithm"] = self.hash_algorithm return model @@ -337,7 +338,7 @@ def _dir_model(self, path, content=True): return model - def _file_model(self, path, content=True, format=None, sha256=False): + def _file_model(self, path, content=True, format=None, require_hash=False): """Build a model for a file if content is requested, include the file contents. @@ -346,6 +347,8 @@ def _file_model(self, path, content=True, format=None, sha256=False): If 'text', the contents will be decoded as UTF-8. If 'base64', the raw bytes contents will be encoded as base64. If not specified, try to decode as UTF-8, and fall back to base64 + + if require_hash is true, the model will include 'hash' """ model = self._base_model(path) model["type"] = "file" @@ -353,8 +356,11 @@ def _file_model(self, path, content=True, format=None, sha256=False): os_path = self._get_os_path(path) model["mimetype"] = mimetypes.guess_type(os_path)[0] - if content: - content, format = self._read_file(os_path, format) + hash_value = None + if content or require_hash: + content, format, hash_value = self._read_file( + os_path, format, require_hash=require_hash + ) if model["mimetype"] is None: default_mime = { "text": "text/plain", @@ -365,18 +371,18 @@ def _file_model(self, path, content=True, format=None, sha256=False): model.update( content=content, format=format, + hash=hash_value, ) - if sha256: - sha256 = self._get_sha256(os_path) - model.update(sha256=sha256) return model - def _notebook_model(self, path, content=True, sha256=False): + def _notebook_model(self, path, content=True, require_hash=False): """Build a notebook model if content is requested, the notebook content will be populated as a JSON structure (not double-serialized) + + if require_hash is true, the model will include 'hash' """ model = self._base_model(path) model["type"] = "notebook" @@ -391,12 +397,13 @@ def _notebook_model(self, path, content=True, sha256=False): model["content"] = nb model["format"] = "json" self.validate_notebook_model(model, validation_error) - if sha256: - model["sha256"] = self._get_sha256(os_path) + if require_hash: + # FIXME: Here we may read file twice while content=True + model["hash"] = self._get_hash_from_file(os_path) return model - def get(self, path, content=True, type=None, format=None, sha256=None): + def get(self, path, content=True, type=None, format=None, require_hash=None): """Takes a path for an entity and returns its model Parameters @@ -411,8 +418,8 @@ def get(self, path, content=True, type=None, format=None, sha256=None): format : str, optional The requested format for file contents. 'text' or 'base64'. Ignored if this returns a notebook or directory model. - sha256: bool, optional - Whether to include the sha256 of the file contents. + require_hash: bool, optional + Whether to include the hash of the file contents. Returns ------- @@ -440,11 +447,13 @@ def get(self, path, content=True, type=None, format=None, sha256=None): ) model = self._dir_model(path, content=content) elif type == "notebook" or (type is None and path.endswith(".ipynb")): - model = self._notebook_model(path, content=content, sha256=sha256) + model = self._notebook_model(path, content=content, require_hash=require_hash) else: if type == "directory": raise web.HTTPError(400, "%s is not a directory" % path, reason="bad type") - model = self._file_model(path, content=content, format=format, sha256=sha256) + model = self._file_model( + path, content=content, format=format, require_hash=require_hash + ) self.emit(data={"action": "get", "path": path}) return model @@ -726,8 +735,6 @@ def _human_readable_size(self, size): class AsyncFileContentsManager(FileContentsManager, AsyncFileManagerMixin, AsyncContentsManager): """An async file contents manager.""" - support_sha256 = Bool(True, config=False, help="Support sha256 argument in `get`") - @default("checkpoints_class") def _checkpoints_class_default(self): return AsyncFileCheckpoints @@ -797,7 +804,7 @@ async def _dir_model(self, path, content=True): return model - async def _file_model(self, path, content=True, format=None, sha256=False): + async def _file_model(self, path, content=True, format=None, require_hash=False): """Build a model for a file if content is requested, include the file contents. @@ -806,6 +813,8 @@ async def _file_model(self, path, content=True, format=None, sha256=False): If 'text', the contents will be decoded as UTF-8. If 'base64', the raw bytes contents will be encoded as base64. If not specified, try to decode as UTF-8, and fall back to base64 + + if require_hash is true, the model will include 'hash' """ model = self._base_model(path) model["type"] = "file" @@ -813,8 +822,10 @@ async def _file_model(self, path, content=True, format=None, sha256=False): os_path = self._get_os_path(path) model["mimetype"] = mimetypes.guess_type(os_path)[0] - if content: - content, format = await self._read_file(os_path, format) + if content or require_hash: + content, format, hash_value = await self._read_file( + os_path, format, require_hash=require_hash + ) if model["mimetype"] is None: default_mime = { "text": "text/plain", @@ -822,17 +833,11 @@ async def _file_model(self, path, content=True, format=None, sha256=False): }[format] model["mimetype"] = default_mime - model.update( - content=content, - format=format, - ) - if sha256: - sha256 = await self._get_sha256(os_path) - model.update(sha256=sha256) + model.update(content=content, format=format, hash=hash_value) return model - async def _notebook_model(self, path, content=True, sha256=False): + async def _notebook_model(self, path, content=True, require_hash=False): """Build a notebook model if content is requested, the notebook content will be populated @@ -851,12 +856,12 @@ async def _notebook_model(self, path, content=True, sha256=False): model["content"] = nb model["format"] = "json" self.validate_notebook_model(model, validation_error) - if sha256: - model["sha256"] = await self._get_sha256(os_path) + if require_hash: + model["hash"] = await self._get_hash_from_file(os_path) return model - async def get(self, path, content=True, type=None, format=None, sha256=False): + async def get(self, path, content=True, type=None, format=None, require_hash=False): """Takes a path for an entity and returns its model Parameters @@ -871,8 +876,8 @@ async def get(self, path, content=True, type=None, format=None, sha256=False): format : str, optional The requested format for file contents. 'text' or 'base64'. Ignored if this returns a notebook or directory model. - sha256: bool, optional - Whether to include the sha256 of the file contents. + require_hash: bool, optional + Whether to include the hash of the file contents. Returns ------- @@ -895,11 +900,13 @@ async def get(self, path, content=True, type=None, format=None, sha256=False): ) model = await self._dir_model(path, content=content) elif type == "notebook" or (type is None and path.endswith(".ipynb")): - model = await self._notebook_model(path, content=content, sha256=sha256) + model = await self._notebook_model(path, content=content, require_hash=require_hash) else: if type == "directory": raise web.HTTPError(400, "%s is not a directory" % path, reason="bad type") - model = await self._file_model(path, content=content, format=format, sha256=sha256) + model = await self._file_model( + path, content=content, format=format, require_hash=require_hash + ) self.emit(data={"action": "get", "path": path}) return model diff --git a/jupyter_server/services/contents/handlers.py b/jupyter_server/services/contents/handlers.py index 3534e8207d..d5c3d90581 100644 --- a/jupyter_server/services/contents/handlers.py +++ b/jupyter_server/services/contents/handlers.py @@ -44,14 +44,14 @@ def _validate_in_or_not(expect_in_model: bool, model: Dict[str, Any], maybe_none ) -def validate_model(model, expect_content, expect_sha256): +def validate_model(model, expect_content, expect_hash): """ Validate a model returned by a ContentsManager method. If expect_content is True, then we expect non-null entries for 'content' and 'format'. - If expect_sha256 is True, then we expect non-null entries for 'sha256'. + If expect_hash is True, then we expect non-null entries for 'hash' and 'hash_algorithm'. """ required_keys = { "name", @@ -63,7 +63,8 @@ def validate_model(model, expect_content, expect_sha256): "mimetype", "content", "format", - "sha256", + "hash", + "hash_algorithm", } missing = required_keys - set(model.keys()) if missing: @@ -73,9 +74,9 @@ def validate_model(model, expect_content, expect_sha256): ) content_keys = ["content", "format"] - sha256_keys = ["sha256"] + hash_keys = ["hash"] _validate_in_or_not(expect_content, model, content_keys) - _validate_in_or_not(expect_sha256, model, sha256_keys) + _validate_in_or_not(expect_hash, model, hash_keys) class ContentsAPIHandler(APIHandler): @@ -136,10 +137,10 @@ async def get(self, path=""): raise web.HTTPError(400, "Content %r is invalid" % content_str) content = int(content_str or "") - sha256_str = self.get_query_argument("sha256", default="0") - if sha256_str not in {"0", "1"}: - raise web.HTTPError(400, "Content %r is invalid" % sha256_str) - sha256 = int(sha256_str or "") + hash_str = self.get_query_argument("hash", default="0") + if hash_str not in {"0", "1"}: + raise web.HTTPError(400, "Content %r is invalid" % hash_str) + require_hash = int(hash_str or "") if not cm.allow_hidden and await ensure_async(cm.is_hidden(path)): await self._finish_error( @@ -151,12 +152,12 @@ async def get(self, path=""): "format": format, "content": content, } - if cm.support_sha256: - kwargs["sha256"] = sha256 + if cm.support_hash: + kwargs["require_hash"] = require_hash try: model = await ensure_async(self.contents_manager.get(**kwargs)) - validate_model(model, expect_content=content, expect_sha256=sha256) + validate_model(model, expect_content=content, expect_hash=require_hash) self._finish_model(model, location=False) except web.HTTPError as exc: # 404 is okay in this context, catch exception and return 404 code to prevent stack trace on client @@ -186,7 +187,7 @@ async def patch(self, path=""): raise web.HTTPError(400, f"Cannot rename file or directory {path!r}") model = await ensure_async(cm.update(model, path)) - validate_model(model, expect_content=False, expect_sha256=False) + validate_model(model, expect_content=False, expect_hash=False) self._finish_model(model) async def _copy(self, copy_from, copy_to=None): @@ -199,7 +200,7 @@ async def _copy(self, copy_from, copy_to=None): ) model = await ensure_async(self.contents_manager.copy(copy_from, copy_to)) self.set_status(201) - validate_model(model, expect_content=False, expect_sha256=False) + validate_model(model, expect_content=False, expect_hash=False) self._finish_model(model) async def _upload(self, model, path): @@ -207,7 +208,7 @@ async def _upload(self, model, path): self.log.info("Uploading file to %s", path) model = await ensure_async(self.contents_manager.new(model, path)) self.set_status(201) - validate_model(model, expect_content=False, expect_sha256=False) + validate_model(model, expect_content=False, expect_hash=False) self._finish_model(model) async def _new_untitled(self, path, type="", ext=""): @@ -217,7 +218,7 @@ async def _new_untitled(self, path, type="", ext=""): self.contents_manager.new_untitled(path=path, type=type, ext=ext) ) self.set_status(201) - validate_model(model, expect_content=False, expect_sha256=False) + validate_model(model, expect_content=False, expect_hash=False) self._finish_model(model) async def _save(self, model, path): @@ -226,7 +227,7 @@ async def _save(self, model, path): if not chunk or chunk == -1: # Avoid tedious log information self.log.info("Saving file at %s", path) model = await ensure_async(self.contents_manager.save(model, path)) - validate_model(model, expect_content=False, expect_sha256=False) + validate_model(model, expect_content=False, expect_hash=False) self._finish_model(model) @web.authenticated diff --git a/jupyter_server/services/contents/manager.py b/jupyter_server/services/contents/manager.py index 382154b7f3..d66a07bdc7 100644 --- a/jupyter_server/services/contents/manager.py +++ b/jupyter_server/services/contents/manager.py @@ -111,7 +111,7 @@ def _validate_preferred_dir(self, proposal): return value allow_hidden = Bool(False, config=True, help="Allow access to hidden files") - support_sha256 = Bool(False, config=False, help="Support sha256 argument in `get`") + support_hash = Bool(False, config=False, help="Support hash argument in `get`") notary = Instance(sign.NotebookNotary) @@ -452,9 +452,10 @@ def get(self, path, content=True, type=None, format=None): """ Get a file or directory model. - If a ContentManager supports calculating the sha256 value of a file, - `ContentManager.support_sha256` should be True and this function will accept an `sha256` parameter, - will return a dict with an `sha256` key. + Support for hash: + If a ContentManager supports calculating the hash value of a file, + `ContentManager.support_hash` should be True and this function will accept an `require_hash` parameter, + will return a dict with `hash` and `hash_algorithm` key. """ raise NotImplementedError @@ -860,9 +861,10 @@ async def get(self, path, content=True, type=None, format=None): """ Get a file or directory model. - If a ContentManager supports calculating the sha256 value of a file, - ContentManager.support_sha256 should be True and this function will accept an sha256 parameter, - will return a dict with an 'sha256' key. + Support for hash: + If a ContentManager supports calculating the hash value of a file, + `ContentManager.support_hash` should be True and this function will accept an `require_hash` parameter, + will return a dict with `hash` and `hash_algorithm` key. """ raise NotImplementedError diff --git a/tests/services/contents/test_api.py b/tests/services/contents/test_api.py index 4d75d4ce66..abab48445a 100644 --- a/tests/services/contents/test_api.py +++ b/tests/services/contents/test_api.py @@ -97,21 +97,26 @@ async def test_get_nb_contents(jp_fetch, contents, path, name): assert model["path"] == nbpath assert model["type"] == "notebook" assert "content" in model + assert "hash" in model + assert model["hash"] == None + assert "hash_algorithm" in model assert model["format"] == "json" assert "metadata" in model["content"] assert isinstance(model["content"]["metadata"], dict) @pytest.mark.parametrize("path,name", dirs) -async def test_get_nb_sha256(jp_fetch, contents, path, name): +async def test_get_nb_hash(jp_fetch, contents, path, name): nbname = name + ".ipynb" nbpath = (path + "/" + nbname).lstrip("/") - r = await jp_fetch("api", "contents", nbpath, method="GET", params=dict(sha256="1")) + r = await jp_fetch("api", "contents", nbpath, method="GET", params=dict(hash="1")) model = json.loads(r.body.decode()) assert model["name"] == nbname assert model["path"] == nbpath assert model["type"] == "notebook" - assert "sha256" in model + assert "hash" in model + assert model["hash"] + assert "hash_algorithm" in model assert "metadata" in model["content"] assert isinstance(model["content"]["metadata"], dict) @@ -125,6 +130,9 @@ async def test_get_nb_no_contents(jp_fetch, contents, path, name): assert model["name"] == nbname assert model["path"] == nbpath assert model["type"] == "notebook" + assert "hash" in model + assert model["hash"] == None + assert "hash_algorithm" in model assert "content" in model assert model["content"] is None @@ -175,6 +183,9 @@ async def test_get_text_file_contents(jp_fetch, contents, path, name): model = json.loads(r.body.decode()) assert model["name"] == txtname assert model["path"] == txtpath + assert "hash" in model + assert model["hash"] == None + assert "hash_algorithm" in model assert "content" in model assert model["format"] == "text" assert model["type"] == "file" @@ -201,14 +212,16 @@ async def test_get_text_file_contents(jp_fetch, contents, path, name): @pytest.mark.parametrize("path,name", dirs) -async def test_get_text_file_sha256(jp_fetch, contents, path, name): +async def test_get_text_file_hash(jp_fetch, contents, path, name): txtname = name + ".txt" txtpath = (path + "/" + txtname).lstrip("/") - r = await jp_fetch("api", "contents", txtpath, method="GET", params=dict(sha256="1")) + r = await jp_fetch("api", "contents", txtpath, method="GET", params=dict(hash="1")) model = json.loads(r.body.decode()) assert model["name"] == txtname assert model["path"] == txtpath - assert "sha256" in model + assert "hash" in model + assert model["hash"] + assert "hash_algorithm" in model assert model["format"] == "text" assert model["type"] == "file" @@ -253,6 +266,9 @@ async def test_get_binary_file_contents(jp_fetch, contents, path, name): assert model["name"] == blobname assert model["path"] == blobpath assert "content" in model + assert "hash" in model + assert model["hash"] == None + assert "hash_algorithm" in model assert model["format"] == "base64" assert model["type"] == "file" data_out = decodebytes(model["content"].encode("ascii")) diff --git a/tests/services/contents/test_fileio.py b/tests/services/contents/test_fileio.py index 6ec46f5d00..c31769f13c 100644 --- a/tests/services/contents/test_fileio.py +++ b/tests/services/contents/test_fileio.py @@ -144,7 +144,7 @@ def test_file_manager_mixin(tmpdir): bad_content.write_text("{}", "utf8") # Same as `echo -n {} | sha256sum` assert ( - mixin._get_sha256(bad_content) + mixin._get_hash_from_file(bad_content) == "44136fa355b3678a1146ad16f7e8649e94fb4fc21fe77e8310c060f61caaff8a" ) with pytest.raises(HTTPError): @@ -171,7 +171,7 @@ async def test_async_file_manager_mixin(tmpdir): bad_content.write_text("{}", "utf8") # Same as `echo -n {} | sha256sum` assert ( - await mixin._get_sha256(bad_content) + await mixin._get_hash_from_file(bad_content) == "44136fa355b3678a1146ad16f7e8649e94fb4fc21fe77e8310c060f61caaff8a" ) with pytest.raises(HTTPError): diff --git a/tests/services/contents/test_manager.py b/tests/services/contents/test_manager.py index 62377041e8..3c3b480132 100644 --- a/tests/services/contents/test_manager.py +++ b/tests/services/contents/test_manager.py @@ -571,8 +571,8 @@ async def test_get(jp_contents_manager): nb_as_bin_file = await ensure_async(cm.get(path, content=True, type="file", format="base64")) assert nb_as_bin_file["format"] == "base64" - nb_with_sha256 = await ensure_async(cm.get(path, sha256=True)) - assert nb_with_sha256["sha256"] + nb_with_hash = await ensure_async(cm.get(path, require_hash=True)) + assert nb_with_hash["hash"] # Test in sub-directory sub_dir = "/foo/" @@ -588,7 +588,7 @@ async def test_get(jp_contents_manager): # Test with a regular file. file_model_path = (await ensure_async(cm.new_untitled(path=sub_dir, ext=".txt")))["path"] - file_model = await ensure_async(cm.get(file_model_path, sha256=True)) + file_model = await ensure_async(cm.get(file_model_path, require_hash=True)) expected_model = { "content": "", "format": "text", @@ -603,7 +603,7 @@ async def test_get(jp_contents_manager): assert file_model[key] == value assert "created" in file_model assert "last_modified" in file_model - assert "sha256" in file_model + assert "hash" in file_model # Create a sub-sub directory to test getting directory contents with a # subdir. From 87aaa5baf6c001114860802a45bd7870dae6d1f4 Mon Sep 17 00:00:00 2001 From: Wh1isper <9573586@qq.com> Date: Thu, 23 Nov 2023 15:33:04 +0800 Subject: [PATCH 10/16] Fix typo --- jupyter_server/services/contents/manager.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/jupyter_server/services/contents/manager.py b/jupyter_server/services/contents/manager.py index d66a07bdc7..1c0455c4c9 100644 --- a/jupyter_server/services/contents/manager.py +++ b/jupyter_server/services/contents/manager.py @@ -111,7 +111,7 @@ def _validate_preferred_dir(self, proposal): return value allow_hidden = Bool(False, config=True, help="Allow access to hidden files") - support_hash = Bool(False, config=False, help="Support hash argument in `get`") + support_hash = Bool(False, config=False, help="Support require_hash argument in `get`") notary = Instance(sign.NotebookNotary) From f655074051f77f3fd10e4216462c213be3573c7f Mon Sep 17 00:00:00 2001 From: Wh1isper <9573586@qq.com> Date: Thu, 23 Nov 2023 15:36:07 +0800 Subject: [PATCH 11/16] Add comment --- jupyter_server/services/contents/handlers.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/jupyter_server/services/contents/handlers.py b/jupyter_server/services/contents/handlers.py index d5c3d90581..cbd71a1b27 100644 --- a/jupyter_server/services/contents/handlers.py +++ b/jupyter_server/services/contents/handlers.py @@ -152,6 +152,9 @@ async def get(self, path=""): "format": format, "content": content, } + + # See https://github.com/jupyter-server/jupyter_server/issues/1366 + # try not breaking `ContentManager.get` API while intro `hash` argument to ContentManager if cm.support_hash: kwargs["require_hash"] = require_hash From 6353c6ab8c3c56ab816fbb53b222964f03d5b040 Mon Sep 17 00:00:00 2001 From: Wh1isper <9573586@qq.com> Date: Thu, 23 Nov 2023 15:39:29 +0800 Subject: [PATCH 12/16] Add more comment --- jupyter_server/services/contents/filemanager.py | 1 + 1 file changed, 1 insertion(+) diff --git a/jupyter_server/services/contents/filemanager.py b/jupyter_server/services/contents/filemanager.py index a0300a87e0..f01d22620a 100644 --- a/jupyter_server/services/contents/filemanager.py +++ b/jupyter_server/services/contents/filemanager.py @@ -857,6 +857,7 @@ async def _notebook_model(self, path, content=True, require_hash=False): model["format"] = "json" self.validate_notebook_model(model, validation_error) if require_hash: + # FIXME: Here we may read file twice while content=True model["hash"] = await self._get_hash_from_file(os_path) return model From f5956f5babf91497a482c7958b5d5cbe1bf6c03d Mon Sep 17 00:00:00 2001 From: Wh1isper <9573586@qq.com> Date: Fri, 24 Nov 2023 00:33:26 +0800 Subject: [PATCH 13/16] cherry-pick https://github.com/fcollonval/jupyter_server/pull/1 MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Co-authored-by: Frédéric Collonval --- docs/source/developers/contents.rst | 80 +++--- jupyter_server/services/api/api.yaml | 8 +- .../services/contents/filecheckpoints.py | 4 +- jupyter_server/services/contents/fileio.py | 271 +++++++++++------- .../services/contents/filemanager.py | 58 ++-- jupyter_server/services/contents/handlers.py | 73 +++-- jupyter_server/services/contents/manager.py | 31 +- tests/services/contents/test_api.py | 10 +- tests/services/contents/test_fileio.py | 55 +++- tests/services/contents/test_manager.py | 31 +- .../services/contents/test_manager_no_hash.py | 44 +++ 11 files changed, 421 insertions(+), 244 deletions(-) create mode 100644 tests/services/contents/test_manager_no_hash.py diff --git a/docs/source/developers/contents.rst b/docs/source/developers/contents.rst index 48c329fa2a..f61e5d36fb 100644 --- a/docs/source/developers/contents.rst +++ b/docs/source/developers/contents.rst @@ -33,44 +33,46 @@ which we refer to as **models**. Models may contain the following entries: -+--------------------+-----------+------------------------------+ -| Key | Type |Info | -+====================+===========+==============================+ -|**name** |unicode |Basename of the entity. | -+--------------------+-----------+------------------------------+ -|**path** |unicode |Full | -| | |(:ref:`API-style`) | -| | |path to the entity. | -+--------------------+-----------+------------------------------+ -|**type** |unicode |The entity type. One of | -| | |``"notebook"``, ``"file"`` or | -| | |``"directory"``. | -+--------------------+-----------+------------------------------+ -|**created** |datetime |Creation date of the entity. | -+--------------------+-----------+------------------------------+ -|**last_modified** |datetime |Last modified date of the | -| | |entity. | -+--------------------+-----------+------------------------------+ -|**content** |variable |The "content" of the entity. | -| | |(:ref:`See | -| | |Below`) | -+--------------------+-----------+------------------------------+ -|**mimetype** |unicode or |The mimetype of ``content``, | -| |``None`` |if any. (:ref:`See | -| | |Below`) | -+--------------------+-----------+------------------------------+ -|**format** |unicode or |The format of ``content``, | -| |``None`` |if any. (:ref:`See | -| | |Below`) | -+--------------------+-----------+------------------------------+ -|**hash** |unicode or |The hash of the contents. | -| |``None`` | | -| | | | -+--------------------+-----------+------------------------------+ -|**hash_algorithm** |unicode |The algorithm used to compute | -| | |hash value. | -| | | | -+--------------------+-----------+------------------------------+ ++--------------------+------------+-----------------------------------------+ +| Key | Type | Info | ++====================+============+=========================================+ +| **name** | unicode | Basename of the entity. | ++--------------------+------------+-----------------------------------------+ +| **path** | unicode | Full | +| | | (:ref:`API-style`) | +| | | path to the entity. | ++--------------------+------------+-----------------------------------------+ +| **type** | unicode | The entity type. One of | +| | | ``"notebook"``, ``"file"`` or | +| | | ``"directory"``. | ++--------------------+------------+-----------------------------------------+ +| **created** | datetime | Creation date of the entity. | ++--------------------+------------+-----------------------------------------+ +| **last_modified** | datetime | Last modified date of the | +| | | entity. | ++--------------------+------------+-----------------------------------------+ +| **content** | variable | The "content" of the entity. | +| | | (:ref:`See | +| | | Below`) | ++--------------------+------------+-----------------------------------------+ +| **mimetype** | unicode or | The mimetype of ``content``, | +| | ``None`` | if any. (:ref:`See | +| | | Below`) | ++--------------------+------------+-----------------------------------------+ +| **format** | unicode or | The format of ``content``, | +| | ``None`` | if any. (:ref:`See | +| | | Below`) | ++--------------------+------------+-----------------------------------------+ +| [optional] | | | +| **hash** | unicode or | The hash of the contents. | +| | ``None`` | It cannot be null if ``hash_algorithm`` | +| | | is defined. | ++--------------------+------------+-----------------------------------------+ +| [optional] | | | +| **hash_algorithm** | unicode or | The algorithm used to compute | +| | ``None`` | hash value. | +| | | It cannot be null if hash is defined. | ++--------------------+------------+-----------------------------------------+ .. _modelcontent: @@ -122,7 +124,7 @@ model. There are three model types: **notebook**, **file**, and **directory**. .. code-block:: python - # Notebook Model with Content + # Notebook Model with Content and Hash { "content": { "metadata": {}, diff --git a/jupyter_server/services/api/api.yaml b/jupyter_server/services/api/api.yaml index 3b77a60314..5ee5c416bd 100644 --- a/jupyter_server/services/api/api.yaml +++ b/jupyter_server/services/api/api.yaml @@ -108,7 +108,7 @@ paths: type: integer - name: hash in: query - description: "Return hash hexdigest string of content (0 for no hash, 1 for return hash), when ContentsManager not support hash, it will be ignored." + description: "May return hash hexdigest string of content and the hash algorithm (0 for no hash - default, 1 for return hash). It may be ignored by the content manager." type: integer responses: 404: @@ -901,8 +901,6 @@ definitions: - mimetype - format - content - - hash - - hash_algorithm properties: name: type: string @@ -942,10 +940,10 @@ definitions: description: Format of content (one of null, 'text', 'base64', 'json') hash: type: string - description: "The hexdigest hash string of content, if requested (otherwise null)." + description: "[optional] The hexdigest hash string of content, if requested (otherwise null). It cannot be null if hash_algorithm is defined." hash_algorithm: type: string - ddescription: "The algorithm used to produce the hash." + description: "[optional] The algorithm used to produce the hash, if requested (otherwise null). It cannot be null if hash is defined." Checkpoints: description: A checkpoint object. type: object diff --git a/jupyter_server/services/contents/filecheckpoints.py b/jupyter_server/services/contents/filecheckpoints.py index d7c7e77103..f6d1ef44e7 100644 --- a/jupyter_server/services/contents/filecheckpoints.py +++ b/jupyter_server/services/contents/filecheckpoints.py @@ -252,7 +252,7 @@ def get_file_checkpoint(self, checkpoint_id, path): if not os.path.isfile(os_checkpoint_path): self.no_such_checkpoint(path, checkpoint_id) - content, format, _ = self._read_file(os_checkpoint_path, format=None) + content, format = self._read_file(os_checkpoint_path, format=None) return { "type": "file", "content": content, @@ -318,7 +318,7 @@ async def get_file_checkpoint(self, checkpoint_id, path): if not os.path.isfile(os_checkpoint_path): self.no_such_checkpoint(path, checkpoint_id) - content, format, _ = await self._read_file(os_checkpoint_path, format=None) + content, format = await self._read_file(os_checkpoint_path, format=None) return { "type": "file", "content": content, diff --git a/jupyter_server/services/contents/fileio.py b/jupyter_server/services/contents/fileio.py index 99debe8c7a..a404ee91b3 100644 --- a/jupyter_server/services/contents/fileio.py +++ b/jupyter_server/services/contents/fileio.py @@ -3,6 +3,9 @@ """ # Copyright (c) Jupyter Development Team. # Distributed under the terms of the Modified BSD License. + +from __future__ import annotations + import errno import hashlib import os @@ -14,7 +17,7 @@ import nbformat from anyio.to_thread import run_sync from tornado.web import HTTPError -from traitlets import Bool, Unicode +from traitlets import Bool, Enum from traitlets.config import Configurable from traitlets.config.configurable import LoggingConfigurable @@ -193,10 +196,11 @@ class FileManagerMixin(LoggingConfigurable, Configurable): If set to False, the new notebook is written directly on the old one which could fail (eg: full filesystem or quota )""", ) - hash_algorithm = Unicode( - "sha256", + hash_algorithm = Enum( + hashlib.algorithms_available, + default_value="sha256", config=True, - help="hash algorithm to use for file content, support by hashlib", + help="Hash algorithm to use for file content, support by hashlib", ) @contextmanager @@ -270,37 +274,42 @@ def _get_os_path(self, path): raise HTTPError(404, "%s is outside root contents directory" % path) return os_path - def _read_notebook(self, os_path, as_version=4, capture_validation_error=None): + def _read_notebook( + self, os_path, as_version=4, capture_validation_error=None, raw: bool = False + ): """Read a notebook from an os path.""" - with self.open(os_path, "r", encoding="utf-8") as f: - try: - return nbformat.read( - f, - as_version=as_version, - capture_validation_error=capture_validation_error, - ) - except Exception as e: - e_orig = e - - # If use_atomic_writing is enabled, we'll guess that it was also - # enabled when this notebook was written and look for a valid - # atomic intermediate. - tmp_path = path_to_intermediate(os_path) - - if not self.use_atomic_writing or not os.path.exists(tmp_path): - raise HTTPError( - 400, - f"Unreadable Notebook: {os_path} {e_orig!r}", - ) + answer = self._read_file(os_path, "text", raw=raw) + + try: + nb = nbformat.reads( + answer[0], + as_version=as_version, + capture_validation_error=capture_validation_error, + ) + + return (nb, answer[2]) if raw else nb + except Exception as e: + e_orig = e + + # If use_atomic_writing is enabled, we'll guess that it was also + # enabled when this notebook was written and look for a valid + # atomic intermediate. + tmp_path = path_to_intermediate(os_path) - # Move the bad file aside, restore the intermediate, and try again. - invalid_file = path_to_invalid(os_path) - replace_file(os_path, invalid_file) - replace_file(tmp_path, os_path) - return self._read_notebook( - os_path, as_version, capture_validation_error=capture_validation_error + if not self.use_atomic_writing or not os.path.exists(tmp_path): + raise HTTPError( + 400, + f"Unreadable Notebook: {os_path} {e_orig!r}", ) + # Move the bad file aside, restore the intermediate, and try again. + invalid_file = path_to_invalid(os_path) + replace_file(os_path, invalid_file) + replace_file(tmp_path, os_path) + return self._read_notebook( + os_path, as_version, capture_validation_error=capture_validation_error, raw=raw + ) + def _save_notebook(self, os_path, nb, capture_validation_error=None): """Save a notebook to an os_path.""" with self.atomic_writing(os_path, encoding="utf-8") as f: @@ -311,26 +320,46 @@ def _save_notebook(self, os_path, nb, capture_validation_error=None): capture_validation_error=capture_validation_error, ) - def _get_hash_from_file(self, os_path): - _, _, h = self._read_file(os_path, "byte", require_hash=True) - return h + def _get_hash(self, byte_content: bytes) -> dict[str, str]: + """Compute the hash hexdigest for the provided bytes. - def _get_hash_from_content(self, bcontent: bytes): - h = hashlib.new(self.hash_algorithm) - h.update(bcontent) - return h.hexdigest() + The hash algorithm is provided by the `hash_algorithm` attribute. - def _read_file(self, os_path, format, require_hash=False): + Parameters + ---------- + byte_content : bytes + The bytes to hash + + Returns + ------- + A dictionary to be appended to a model {"hash": str, "hash_algorithm": str}. + """ + algorithm = self.hash_algorithm + h = hashlib.new(algorithm) + h.update(byte_content) + return {"hash": h.hexdigest(), "hash_algorithm": algorithm} + + def _read_file( + self, os_path: str, format: str, raw: bool = False + ) -> tuple[str, str] | tuple[str, str, bytes]: """Read a non-notebook file. - os_path: The path to be read. - format: - If 'text', the contents will be decoded as UTF-8. - If 'base64', the raw bytes contents will be encoded as base64. - If 'byte', the raw bytes contents will be returned. - If not specified, try to decode as UTF-8, and fall back to base64 - hash: - If require_hash is 'True', return the hash of the file contents as a hex string. + Parameters + ---------- + os_path: str + The path to be read. + format: str + If 'text', the contents will be decoded as UTF-8. + If 'base64', the raw bytes contents will be encoded as base64. + If 'byte', the raw bytes contents will be returned. + If not specified, try to decode as UTF-8, and fall back to base64 + raw: bool + [Optional] If True, will return as third argument the raw bytes content + + Returns + ------- + (content, format, byte_content) It returns the content in the given format + as well as the raw byte content. """ if not os.path.isfile(os_path): raise HTTPError(400, "Cannot read non-file %s" % os_path) @@ -338,18 +367,22 @@ def _read_file(self, os_path, format, require_hash=False): with self.open(os_path, "rb") as f: bcontent = f.read() - # Calculate hash value if required - hash_value = self._get_hash_from_content(bcontent) if require_hash else None - if format == "byte": # Not for http response but internal use - return bcontent, "byte", hash_value + return (bcontent, "byte", bcontent) if raw else (bcontent, "byte") if format is None or format == "text": # Try to interpret as unicode if format is unknown or if unicode # was explicitly requested. try: - return bcontent.decode("utf8"), "text", hash_value + return ( + (bcontent.decode("utf8"), "text", bcontent) + if raw + else ( + bcontent.decode("utf8"), + "text", + ) + ) except UnicodeError as e: if format == "text": raise HTTPError( @@ -357,7 +390,14 @@ def _read_file(self, os_path, format, require_hash=False): "%s is not UTF-8 encoded" % os_path, reason="bad format", ) from e - return encodebytes(bcontent).decode("ascii"), "base64", hash_value + return ( + (encodebytes(bcontent).decode("ascii"), "base64", bcontent) + if raw + else ( + encodebytes(bcontent).decode("ascii"), + "base64", + ) + ) def _save_file(self, os_path, content, format): """Save content of a generic file.""" @@ -391,40 +431,46 @@ async def _copy(self, src, dest): """ await async_copy2_safe(src, dest, log=self.log) - async def _read_notebook(self, os_path, as_version=4, capture_validation_error=None): + async def _read_notebook( + self, os_path, as_version=4, capture_validation_error=None, raw: bool = False + ): """Read a notebook from an os path.""" - with self.open(os_path, encoding="utf-8") as f: - try: - return await run_sync( - partial( - nbformat.read, - as_version=as_version, - capture_validation_error=capture_validation_error, - ), - f, - ) - except Exception as e: - e_orig = e - - # If use_atomic_writing is enabled, we'll guess that it was also - # enabled when this notebook was written and look for a valid - # atomic intermediate. - tmp_path = path_to_intermediate(os_path) - - if not self.use_atomic_writing or not os.path.exists(tmp_path): - raise HTTPError( - 400, - f"Unreadable Notebook: {os_path} {e_orig!r}", - ) + answer = await self._read_file(os_path, "text", raw) + + try: + nb = await run_sync( + partial( + nbformat.reads, + as_version=as_version, + capture_validation_error=capture_validation_error, + ), + answer[0], + ) + return (nb, answer[2]) if raw else nb + except Exception as e: + e_orig = e - # Move the bad file aside, restore the intermediate, and try again. - invalid_file = path_to_invalid(os_path) - await async_replace_file(os_path, invalid_file) - await async_replace_file(tmp_path, os_path) - return await self._read_notebook( - os_path, as_version, capture_validation_error=capture_validation_error + # If use_atomic_writing is enabled, we'll guess that it was also + # enabled when this notebook was written and look for a valid + # atomic intermediate. + tmp_path = path_to_intermediate(os_path) + + if not self.use_atomic_writing or not os.path.exists(tmp_path): + raise HTTPError( + 400, + f"Unreadable Notebook: {os_path} {e_orig!r}", ) + # Move the bad file aside, restore the intermediate, and try again. + invalid_file = path_to_invalid(os_path) + await async_replace_file(os_path, invalid_file) + await async_replace_file(tmp_path, os_path) + answer = await self._read_notebook( + os_path, as_version, capture_validation_error=capture_validation_error, raw=raw + ) + + return answer + async def _save_notebook(self, os_path, nb, capture_validation_error=None): """Save a notebook to an os_path.""" with self.atomic_writing(os_path, encoding="utf-8") as f: @@ -438,17 +484,27 @@ async def _save_notebook(self, os_path, nb, capture_validation_error=None): f, ) - async def _read_file(self, os_path, format, require_hash=False): + async def _read_file( + self, os_path: str, format: str, raw: bool = False + ) -> tuple[str, str] | tuple[str, str, bytes]: """Read a non-notebook file. - os_path: The path to be read. - format: - If 'text', the contents will be decoded as UTF-8. - If 'base64', the raw bytes contents will be encoded as base64. - If 'byte', the raw bytes contents will be returned. - If not specified, try to decode as UTF-8, and fall back to base64 - hash: - If require_hash is 'True', return the hash of the file contents as a hex string. + Parameters + ---------- + os_path: str + The path to be read. + format: str + If 'text', the contents will be decoded as UTF-8. + If 'base64', the raw bytes contents will be encoded as base64. + If 'byte', the raw bytes contents will be returned. + If not specified, try to decode as UTF-8, and fall back to base64 + raw: bool + [Optional] If True, will return as third argument the raw bytes content + + Returns + ------- + (content, format, byte_content) It returns the content in the given format + as well as the raw byte content. """ if not os.path.isfile(os_path): raise HTTPError(400, "Cannot read non-file %s" % os_path) @@ -456,20 +512,22 @@ async def _read_file(self, os_path, format, require_hash=False): with self.open(os_path, "rb") as f: bcontent = await run_sync(f.read) - if require_hash: - hash_value = await self._get_hash_from_content(bcontent) - else: - hash_value = None - if format == "byte": # Not for http response but internal use - return bcontent, "byte", hash_value + return (bcontent, "byte", bcontent) if raw else (bcontent, "byte") if format is None or format == "text": # Try to interpret as unicode if format is unknown or if unicode # was explicitly requested. try: - return bcontent.decode("utf8"), "text", hash_value + return ( + (bcontent.decode("utf8"), "text", bcontent) + if raw + else ( + bcontent.decode("utf8"), + "text", + ) + ) except UnicodeError as e: if format == "text": raise HTTPError( @@ -477,7 +535,11 @@ async def _read_file(self, os_path, format, require_hash=False): "%s is not UTF-8 encoded" % os_path, reason="bad format", ) from e - return encodebytes(bcontent).decode("ascii"), "base64", hash_value + return ( + (encodebytes(bcontent).decode("ascii"), "base64", bcontent) + if raw + else (encodebytes(bcontent).decode("ascii"), "base64") + ) async def _save_file(self, os_path, content, format): """Save content of a generic file.""" @@ -497,12 +559,3 @@ async def _save_file(self, os_path, content, format): with self.atomic_writing(os_path, text=False) as f: await run_sync(f.write, bcontent) - - async def _get_hash_from_content(self, bcontent: bytes): - h = hashlib.new(self.hash_algorithm) - await run_sync(h.update, bcontent) - return h.hexdigest() - - async def _get_hash_from_file(self, os_path): - _, _, h = await self._read_file(os_path, "byte", require_hash=True) - return h diff --git a/jupyter_server/services/contents/filemanager.py b/jupyter_server/services/contents/filemanager.py index f01d22620a..301de58106 100644 --- a/jupyter_server/services/contents/filemanager.py +++ b/jupyter_server/services/contents/filemanager.py @@ -48,7 +48,6 @@ class FileContentsManager(FileManagerMixin, ContentsManager): root_dir = Unicode(config=True) max_copy_folder_size_mb = Int(500, config=True, help="The max folder size that can be copied") - support_hash = Bool(True, config=False, help="Support require_hash argument in `get`") @default("root_dir") def _default_root_dir(self): @@ -270,7 +269,7 @@ def _base_model(self, path): model["size"] = size model["writable"] = self.is_writable(path) model["hash"] = None - model["hash_algorithm"] = self.hash_algorithm + model["hash_algorithm"] = None return model @@ -356,11 +355,9 @@ def _file_model(self, path, content=True, format=None, require_hash=False): os_path = self._get_os_path(path) model["mimetype"] = mimetypes.guess_type(os_path)[0] - hash_value = None - if content or require_hash: - content, format, hash_value = self._read_file( - os_path, format, require_hash=require_hash - ) + bytes_content = None + if content: + content, format, bytes_content = self._read_file(os_path, format, raw=True) if model["mimetype"] is None: default_mime = { "text": "text/plain", @@ -371,9 +368,13 @@ def _file_model(self, path, content=True, format=None, require_hash=False): model.update( content=content, format=format, - hash=hash_value, ) + if require_hash: + if bytes_content is None: + bytes_content, _ = self._read_file(os_path, "byte") + model.update(**self._get_hash(bytes_content)) + return model def _notebook_model(self, path, content=True, require_hash=False): @@ -388,22 +389,25 @@ def _notebook_model(self, path, content=True, require_hash=False): model["type"] = "notebook" os_path = self._get_os_path(path) + bytes_content = None if content: validation_error: dict[str, t.Any] = {} - nb = self._read_notebook( - os_path, as_version=4, capture_validation_error=validation_error + nb, bytes_content = self._read_notebook( + os_path, as_version=4, capture_validation_error=validation_error, raw=True ) self.mark_trusted_cells(nb, path) model["content"] = nb model["format"] = "json" self.validate_notebook_model(model, validation_error) + if require_hash: - # FIXME: Here we may read file twice while content=True - model["hash"] = self._get_hash_from_file(os_path) + if bytes_content is None: + bytes_content, _ = self._read_file(os_path, "byte") + model.update(**self._get_hash(bytes_content)) return model - def get(self, path, content=True, type=None, format=None, require_hash=None): + def get(self, path, content=True, type=None, format=None, require_hash=False): """Takes a path for an entity and returns its model Parameters @@ -822,10 +826,9 @@ async def _file_model(self, path, content=True, format=None, require_hash=False) os_path = self._get_os_path(path) model["mimetype"] = mimetypes.guess_type(os_path)[0] - if content or require_hash: - content, format, hash_value = await self._read_file( - os_path, format, require_hash=require_hash - ) + bytes_content = None + if content: + content, format, bytes_content = await self._read_file(os_path, format, raw=True) if model["mimetype"] is None: default_mime = { "text": "text/plain", @@ -833,7 +836,15 @@ async def _file_model(self, path, content=True, format=None, require_hash=False) }[format] model["mimetype"] = default_mime - model.update(content=content, format=format, hash=hash_value) + model.update( + content=content, + format=format, + ) + + if require_hash: + if bytes_content is None: + bytes_content, _ = await self._read_file(os_path, "byte") + model.update(**self._get_hash(bytes_content)) return model @@ -847,18 +858,21 @@ async def _notebook_model(self, path, content=True, require_hash=False): model["type"] = "notebook" os_path = self._get_os_path(path) + bytes_content = None if content: validation_error: dict[str, t.Any] = {} - nb = await self._read_notebook( - os_path, as_version=4, capture_validation_error=validation_error + nb, bytes_content = await self._read_notebook( + os_path, as_version=4, capture_validation_error=validation_error, raw=True ) self.mark_trusted_cells(nb, path) model["content"] = nb model["format"] = "json" self.validate_notebook_model(model, validation_error) + if require_hash: - # FIXME: Here we may read file twice while content=True - model["hash"] = await self._get_hash_from_file(os_path) + if bytes_content is None: + bytes_content, _ = await self._read_file(os_path, "byte") + model.update(**(self._get_hash(bytes_content))) return model diff --git a/jupyter_server/services/contents/handlers.py b/jupyter_server/services/contents/handlers.py index cbd71a1b27..a7c7ffff17 100644 --- a/jupyter_server/services/contents/handlers.py +++ b/jupyter_server/services/contents/handlers.py @@ -23,20 +23,20 @@ AUTH_RESOURCE = "contents" -def _validate_in_or_not(expect_in_model: bool, model: Dict[str, Any], maybe_none_keys: List[str]): +def _validate_keys(expect_defined: bool, model: Dict[str, Any], keys: List[str]): """ - Validate that the keys in maybe_none_keys are None or not None + Validate that the keys are defined (i.e. not None) or not (i.e. None) """ - if expect_in_model: - errors = [key for key in maybe_none_keys if model[key] is None] + if expect_defined: + errors = [key for key in keys if model[key] is None] if errors: raise web.HTTPError( 500, f"Keys unexpectedly None: {errors}", ) else: - errors = {key: model[key] for key in maybe_none_keys if model[key] is not None} # type: ignore[assignment] + errors = {key: model[key] for key in keys if model[key] is not None} # type: ignore[assignment] if errors: raise web.HTTPError( 500, @@ -44,7 +44,7 @@ def _validate_in_or_not(expect_in_model: bool, model: Dict[str, Any], maybe_none ) -def validate_model(model, expect_content, expect_hash): +def validate_model(model, expect_content=False, expect_hash=False): """ Validate a model returned by a ContentsManager method. @@ -63,9 +63,9 @@ def validate_model(model, expect_content, expect_hash): "mimetype", "content", "format", - "hash", - "hash_algorithm", } + if expect_hash: + required_keys.update(["hash", "hash_algorithm"]) missing = required_keys - set(model.keys()) if missing: raise web.HTTPError( @@ -74,9 +74,9 @@ def validate_model(model, expect_content, expect_hash): ) content_keys = ["content", "format"] - hash_keys = ["hash"] - _validate_in_or_not(expect_content, model, content_keys) - _validate_in_or_not(expect_hash, model, hash_keys) + _validate_keys(expect_content, model, content_keys) + if expect_hash: + _validate_keys(expect_hash, model, ["hash", "hash_algorithm"]) class ContentsAPIHandler(APIHandler): @@ -139,28 +139,39 @@ async def get(self, path=""): hash_str = self.get_query_argument("hash", default="0") if hash_str not in {"0", "1"}: - raise web.HTTPError(400, "Content %r is invalid" % hash_str) - require_hash = int(hash_str or "") + raise web.HTTPError(400, f"Content {hash_str!r} is invalid") + require_hash = int(hash_str) if not cm.allow_hidden and await ensure_async(cm.is_hidden(path)): await self._finish_error( HTTPStatus.NOT_FOUND, f"file or directory {path!r} does not exist" ) - kwargs = { - "path": path, - "type": type, - "format": format, - "content": content, - } - - # See https://github.com/jupyter-server/jupyter_server/issues/1366 - # try not breaking `ContentManager.get` API while intro `hash` argument to ContentManager - if cm.support_hash: - kwargs["require_hash"] = require_hash try: - model = await ensure_async(self.contents_manager.get(**kwargs)) - validate_model(model, expect_content=content, expect_hash=require_hash) + expect_hash = require_hash + try: + model = await ensure_async( + self.contents_manager.get( + path=path, + type=type, + format=format, + content=content, + require_hash=require_hash, + ) + ) + except TypeError: + # Fallback for ContentsManager not handling the require_hash argument + # introduced in 2.11 + expect_hash = False + model = await ensure_async( + self.contents_manager.get( + path=path, + type=type, + format=format, + content=content, + ) + ) + validate_model(model, expect_content=content, expect_hash=expect_hash) self._finish_model(model, location=False) except web.HTTPError as exc: # 404 is okay in this context, catch exception and return 404 code to prevent stack trace on client @@ -190,7 +201,7 @@ async def patch(self, path=""): raise web.HTTPError(400, f"Cannot rename file or directory {path!r}") model = await ensure_async(cm.update(model, path)) - validate_model(model, expect_content=False, expect_hash=False) + validate_model(model) self._finish_model(model) async def _copy(self, copy_from, copy_to=None): @@ -203,7 +214,7 @@ async def _copy(self, copy_from, copy_to=None): ) model = await ensure_async(self.contents_manager.copy(copy_from, copy_to)) self.set_status(201) - validate_model(model, expect_content=False, expect_hash=False) + validate_model(model) self._finish_model(model) async def _upload(self, model, path): @@ -211,7 +222,7 @@ async def _upload(self, model, path): self.log.info("Uploading file to %s", path) model = await ensure_async(self.contents_manager.new(model, path)) self.set_status(201) - validate_model(model, expect_content=False, expect_hash=False) + validate_model(model) self._finish_model(model) async def _new_untitled(self, path, type="", ext=""): @@ -221,7 +232,7 @@ async def _new_untitled(self, path, type="", ext=""): self.contents_manager.new_untitled(path=path, type=type, ext=ext) ) self.set_status(201) - validate_model(model, expect_content=False, expect_hash=False) + validate_model(model) self._finish_model(model) async def _save(self, model, path): @@ -230,7 +241,7 @@ async def _save(self, model, path): if not chunk or chunk == -1: # Avoid tedious log information self.log.info("Saving file at %s", path) model = await ensure_async(self.contents_manager.save(model, path)) - validate_model(model, expect_content=False, expect_hash=False) + validate_model(model) self._finish_model(model) @web.authenticated diff --git a/jupyter_server/services/contents/manager.py b/jupyter_server/services/contents/manager.py index 1c0455c4c9..b12a2055ec 100644 --- a/jupyter_server/services/contents/manager.py +++ b/jupyter_server/services/contents/manager.py @@ -111,7 +111,6 @@ def _validate_preferred_dir(self, proposal): return value allow_hidden = Bool(False, config=True, help="Allow access to hidden files") - support_hash = Bool(False, config=False, help="Support require_hash argument in `get`") notary = Instance(sign.NotebookNotary) @@ -448,14 +447,15 @@ def exists(self, path): """ return self.file_exists(path) or self.dir_exists(path) - def get(self, path, content=True, type=None, format=None): - """ - Get a file or directory model. + def get(self, path, content=True, type=None, format=None, require_hash=False): + """Get a file or directory model. + + Parameters + ---------- + require_hash : bool + Whether the file hash must be returned or not. - Support for hash: - If a ContentManager supports calculating the hash value of a file, - `ContentManager.support_hash` should be True and this function will accept an `require_hash` parameter, - will return a dict with `hash` and `hash_algorithm` key. + *Changed in version 2.11*: The *require_hash* parameter was added. """ raise NotImplementedError @@ -857,14 +857,15 @@ async def exists(self, path): self.dir_exists(path) ) - async def get(self, path, content=True, type=None, format=None): - """ - Get a file or directory model. + async def get(self, path, content=True, type=None, format=None, require_hash=False): + """Get a file or directory model. + + Parameters + ---------- + require_hash : bool + Whether the file hash must be returned or not. - Support for hash: - If a ContentManager supports calculating the hash value of a file, - `ContentManager.support_hash` should be True and this function will accept an `require_hash` parameter, - will return a dict with `hash` and `hash_algorithm` key. + *Changed in version 2.11*: The *require_hash* parameter was added. """ raise NotImplementedError diff --git a/tests/services/contents/test_api.py b/tests/services/contents/test_api.py index abab48445a..b74ee8f62a 100644 --- a/tests/services/contents/test_api.py +++ b/tests/services/contents/test_api.py @@ -97,9 +97,8 @@ async def test_get_nb_contents(jp_fetch, contents, path, name): assert model["path"] == nbpath assert model["type"] == "notebook" assert "content" in model - assert "hash" in model - assert model["hash"] == None - assert "hash_algorithm" in model + assert model["hash"] is None + assert model["hash_algorithm"] is None assert model["format"] == "json" assert "metadata" in model["content"] assert isinstance(model["content"]["metadata"], dict) @@ -114,9 +113,8 @@ async def test_get_nb_hash(jp_fetch, contents, path, name): assert model["name"] == nbname assert model["path"] == nbpath assert model["type"] == "notebook" - assert "hash" in model assert model["hash"] - assert "hash_algorithm" in model + assert model["hash_algorithm"] assert "metadata" in model["content"] assert isinstance(model["content"]["metadata"], dict) @@ -221,7 +219,7 @@ async def test_get_text_file_hash(jp_fetch, contents, path, name): assert model["path"] == txtpath assert "hash" in model assert model["hash"] - assert "hash_algorithm" in model + assert model["hash_algorithm"] assert model["format"] == "text" assert model["type"] == "file" diff --git a/tests/services/contents/test_fileio.py b/tests/services/contents/test_fileio.py index c31769f13c..12752ee810 100644 --- a/tests/services/contents/test_fileio.py +++ b/tests/services/contents/test_fileio.py @@ -137,16 +137,16 @@ def test_path_to_invalid(tmpdir): @pytest.mark.skipif(os.name == "nt", reason="test fails on Windows") -def test_file_manager_mixin(tmpdir): +def test_file_manager_mixin(tmp_path): mixin = FileManagerMixin() mixin.log = logging.getLogger() - bad_content = tmpdir / "bad_content.ipynb" + bad_content = tmp_path / "bad_content.ipynb" bad_content.write_text("{}", "utf8") # Same as `echo -n {} | sha256sum` - assert ( - mixin._get_hash_from_file(bad_content) - == "44136fa355b3678a1146ad16f7e8649e94fb4fc21fe77e8310c060f61caaff8a" - ) + assert mixin._get_hash(bad_content.read_bytes()) == { + "hash": "44136fa355b3678a1146ad16f7e8649e94fb4fc21fe77e8310c060f61caaff8a", + "hash_algorithm": "sha256", + } with pytest.raises(HTTPError): mixin._read_notebook(bad_content) other = path_to_intermediate(bad_content) @@ -157,10 +157,10 @@ def test_file_manager_mixin(tmpdir): validate(nb) with pytest.raises(HTTPError): - mixin._read_file(tmpdir, "text") + mixin._read_file(tmp_path, "text") with pytest.raises(HTTPError): - mixin._save_file(tmpdir / "foo", "foo", "bar") + mixin._save_file(tmp_path / "foo", "foo", "bar") @pytest.mark.skipif(os.name == "nt", reason="test fails on Windows") @@ -169,18 +169,18 @@ async def test_async_file_manager_mixin(tmpdir): mixin.log = logging.getLogger() bad_content = tmpdir / "bad_content.ipynb" bad_content.write_text("{}", "utf8") - # Same as `echo -n {} | sha256sum` - assert ( - await mixin._get_hash_from_file(bad_content) - == "44136fa355b3678a1146ad16f7e8649e94fb4fc21fe77e8310c060f61caaff8a" - ) with pytest.raises(HTTPError): await mixin._read_notebook(bad_content) other = path_to_intermediate(bad_content) with open(other, "w") as fid: json.dump(new_notebook(), fid) mixin.use_atomic_writing = True - nb = await mixin._read_notebook(bad_content) + nb, bcontent = await mixin._read_notebook(bad_content, raw=True) + # Same as `echo -n {} | sha256sum` + assert mixin._get_hash(bcontent) == { + "hash": "4747f9680816e352a697d0fb69d82334457cdd1e46f053e800859833d3e6003e", + "hash_algorithm": "sha256", + } validate(nb) with pytest.raises(HTTPError): @@ -188,3 +188,30 @@ async def test_async_file_manager_mixin(tmpdir): with pytest.raises(HTTPError): await mixin._save_file(tmpdir / "foo", "foo", "bar") + + +async def test_AsyncFileManagerMixin_read_notebook_no_raw(tmpdir): + mixin = AsyncFileManagerMixin() + mixin.log = logging.getLogger() + bad_content = tmpdir / "bad_content.ipynb" + bad_content.write_text("{}", "utf8") + + other = path_to_intermediate(bad_content) + with open(other, "w") as fid: + json.dump(new_notebook(), fid) + mixin.use_atomic_writing = True + answer = await mixin._read_notebook(bad_content) + + assert not isinstance(answer, tuple) + + +async def test_AsyncFileManagerMixin_read_file_no_raw(tmpdir): + mixin = AsyncFileManagerMixin() + mixin.log = logging.getLogger() + file_path = tmpdir / "bad_content.text" + file_path.write_text("blablabla", "utf8") + + mixin.use_atomic_writing = True + answer = await mixin._read_file(file_path, "text") + + assert len(answer) == 2 diff --git a/tests/services/contents/test_manager.py b/tests/services/contents/test_manager.py index 3c3b480132..e718036b0b 100644 --- a/tests/services/contents/test_manager.py +++ b/tests/services/contents/test_manager.py @@ -573,6 +573,14 @@ async def test_get(jp_contents_manager): nb_with_hash = await ensure_async(cm.get(path, require_hash=True)) assert nb_with_hash["hash"] + assert nb_with_hash["hash_algorithm"] + + # Get the hash without the content + nb_with_hash = await ensure_async(cm.get(path, content=False, require_hash=True)) + assert nb_with_hash["content"] is None + assert nb_with_hash["format"] is None + assert nb_with_hash["hash"] + assert nb_with_hash["hash_algorithm"] # Test in sub-directory sub_dir = "/foo/" @@ -597,13 +605,34 @@ async def test_get(jp_contents_manager): "path": "foo/untitled.txt", "type": "file", "writable": True, + "hash_algorithm": cm.hash_algorithm, + } + # Assert expected model is in file_model + for key, value in expected_model.items(): + assert file_model[key] == value + assert "created" in file_model + assert "last_modified" in file_model + assert file_model["hash"] + + # Get hash without content + file_model = await ensure_async(cm.get(file_model_path, content=False, require_hash=True)) + expected_model = { + "content": None, + "format": None, + "mimetype": "text/plain", + "name": "untitled.txt", + "path": "foo/untitled.txt", + "type": "file", + "writable": True, + "hash_algorithm": cm.hash_algorithm, } + # Assert expected model is in file_model for key, value in expected_model.items(): assert file_model[key] == value assert "created" in file_model assert "last_modified" in file_model - assert "hash" in file_model + assert file_model["hash"] # Create a sub-sub directory to test getting directory contents with a # subdir. diff --git a/tests/services/contents/test_manager_no_hash.py b/tests/services/contents/test_manager_no_hash.py new file mode 100644 index 0000000000..511a8d319b --- /dev/null +++ b/tests/services/contents/test_manager_no_hash.py @@ -0,0 +1,44 @@ +import json + +import pytest + +from jupyter_server.services.contents.filemanager import ( + AsyncFileContentsManager, +) + + +class NoHashFileManager(AsyncFileContentsManager): + """FileManager prior to 2.11 that introduce the ability to request file hash.""" + + def _base_model(self, path): + """Drop new attributes from model.""" + model = super()._base_model(path) + + del model["hash"] + del model["hash_algorithm"] + + return model + + async def get(self, path, content=True, type=None, format=None): + """Get without the new `require_hash` argument""" + model = await super().get(path, content=content, type=type, format=format) + return model + + +@pytest.fixture +def jp_server_config(jp_server_config): + jp_server_config["ServerApp"]["contents_manager_class"] = NoHashFileManager + return jp_server_config + + +async def test_manager_no_hash_support(tmp_path, jp_root_dir, jp_fetch): + # Create some content + path = "dummy.txt" + (jp_root_dir / path).write_text("blablabla", encoding="utf-8") + + response = await jp_fetch("api", "contents", path, method="GET", params=dict(hash="1")) + + model = json.loads(response.body) + + assert "hash" not in model + assert "hash_algorithm" not in model From 8380805d54707c586745b49e54fd87d227167e8a Mon Sep 17 00:00:00 2001 From: Wh1isper <9573586@qq.com> Date: Fri, 24 Nov 2023 00:41:04 +0800 Subject: [PATCH 14/16] Reformat table --- docs/source/developers/contents.rst | 81 +++++++++++++++-------------- 1 file changed, 41 insertions(+), 40 deletions(-) diff --git a/docs/source/developers/contents.rst b/docs/source/developers/contents.rst index f61e5d36fb..30dd267a31 100644 --- a/docs/source/developers/contents.rst +++ b/docs/source/developers/contents.rst @@ -33,46 +33,47 @@ which we refer to as **models**. Models may contain the following entries: -+--------------------+------------+-----------------------------------------+ -| Key | Type | Info | -+====================+============+=========================================+ -| **name** | unicode | Basename of the entity. | -+--------------------+------------+-----------------------------------------+ -| **path** | unicode | Full | -| | | (:ref:`API-style`) | -| | | path to the entity. | -+--------------------+------------+-----------------------------------------+ -| **type** | unicode | The entity type. One of | -| | | ``"notebook"``, ``"file"`` or | -| | | ``"directory"``. | -+--------------------+------------+-----------------------------------------+ -| **created** | datetime | Creation date of the entity. | -+--------------------+------------+-----------------------------------------+ -| **last_modified** | datetime | Last modified date of the | -| | | entity. | -+--------------------+------------+-----------------------------------------+ -| **content** | variable | The "content" of the entity. | -| | | (:ref:`See | -| | | Below`) | -+--------------------+------------+-----------------------------------------+ -| **mimetype** | unicode or | The mimetype of ``content``, | -| | ``None`` | if any. (:ref:`See | -| | | Below`) | -+--------------------+------------+-----------------------------------------+ -| **format** | unicode or | The format of ``content``, | -| | ``None`` | if any. (:ref:`See | -| | | Below`) | -+--------------------+------------+-----------------------------------------+ -| [optional] | | | -| **hash** | unicode or | The hash of the contents. | -| | ``None`` | It cannot be null if ``hash_algorithm`` | -| | | is defined. | -+--------------------+------------+-----------------------------------------+ -| [optional] | | | -| **hash_algorithm** | unicode or | The algorithm used to compute | -| | ``None`` | hash value. | -| | | It cannot be null if hash is defined. | -+--------------------+------------+-----------------------------------------+ ++--------------------+------------+--------------------------------+ +| Key | Type | Info | ++====================+============+================================+ +| **name** | unicode | Basename of the entity. | ++--------------------+------------+--------------------------------+ +| **path** | unicode | Full | +| | | (:ref:`API-style`) | +| | | path to the entity. | ++--------------------+------------+--------------------------------+ +| **type** | unicode | The entity type. One of | +| | | ``"notebook"``, ``"file"`` or | +| | | ``"directory"``. | ++--------------------+------------+--------------------------------+ +| **created** | datetime | Creation date of the entity. | ++--------------------+------------+--------------------------------+ +| **last_modified** | datetime | Last modified date of the | +| | | entity. | ++--------------------+------------+--------------------------------+ +| **content** | variable | The "content" of the entity. | +| | | (:ref:`See | +| | | Below`) | ++--------------------+------------+--------------------------------+ +| **mimetype** | unicode or | The mimetype of ``content``, | +| | ``None`` | if any. (:ref:`See | +| | | Below`) | ++--------------------+------------+--------------------------------+ +| **format** | unicode or | The format of ``content``, | +| | ``None`` | if any. (:ref:`See | +| | | Below`) | ++--------------------+------------+--------------------------------+ +| [optional] | | | +| **hash** | unicode or | The hash of the contents. | +| | ``None`` | It cannot be null if | +| | | ``hash_algorithm`` is defined. | ++--------------------+------------+--------------------------------+ +| [optional] | | | +| **hash_algorithm** | unicode or | The algorithm used to compute | +| | ``None`` | hash value. | +| | | It cannot be null | +| | | if ``hash`` is defined. | ++--------------------+------------+--------------------------------+ .. _modelcontent: From 9004e7f584421e30be72a6bdec0aaeb18f22fb0c Mon Sep 17 00:00:00 2001 From: Wh1isper <9573586@qq.com> Date: Fri, 24 Nov 2023 00:42:59 +0800 Subject: [PATCH 15/16] Reformat the table --- docs/source/developers/contents.rst | 83 +++++++++++++++-------------- 1 file changed, 42 insertions(+), 41 deletions(-) diff --git a/docs/source/developers/contents.rst b/docs/source/developers/contents.rst index 30dd267a31..6910535f30 100644 --- a/docs/source/developers/contents.rst +++ b/docs/source/developers/contents.rst @@ -33,47 +33,48 @@ which we refer to as **models**. Models may contain the following entries: -+--------------------+------------+--------------------------------+ -| Key | Type | Info | -+====================+============+================================+ -| **name** | unicode | Basename of the entity. | -+--------------------+------------+--------------------------------+ -| **path** | unicode | Full | -| | | (:ref:`API-style`) | -| | | path to the entity. | -+--------------------+------------+--------------------------------+ -| **type** | unicode | The entity type. One of | -| | | ``"notebook"``, ``"file"`` or | -| | | ``"directory"``. | -+--------------------+------------+--------------------------------+ -| **created** | datetime | Creation date of the entity. | -+--------------------+------------+--------------------------------+ -| **last_modified** | datetime | Last modified date of the | -| | | entity. | -+--------------------+------------+--------------------------------+ -| **content** | variable | The "content" of the entity. | -| | | (:ref:`See | -| | | Below`) | -+--------------------+------------+--------------------------------+ -| **mimetype** | unicode or | The mimetype of ``content``, | -| | ``None`` | if any. (:ref:`See | -| | | Below`) | -+--------------------+------------+--------------------------------+ -| **format** | unicode or | The format of ``content``, | -| | ``None`` | if any. (:ref:`See | -| | | Below`) | -+--------------------+------------+--------------------------------+ -| [optional] | | | -| **hash** | unicode or | The hash of the contents. | -| | ``None`` | It cannot be null if | -| | | ``hash_algorithm`` is defined. | -+--------------------+------------+--------------------------------+ -| [optional] | | | -| **hash_algorithm** | unicode or | The algorithm used to compute | -| | ``None`` | hash value. | -| | | It cannot be null | -| | | if ``hash`` is defined. | -+--------------------+------------+--------------------------------+ ++--------------------+------------+-------------------------------+ +| Key | Type | Info | ++====================+============+===============================+ +| **name** | unicode | Basename of the entity. | ++--------------------+------------+-------------------------------+ +| **path** | unicode | Full | +| | | (:ref:`API-style`) | +| | | path to the entity. | ++--------------------+------------+-------------------------------+ +| **type** | unicode | The entity type. One of | +| | | ``"notebook"``, ``"file"`` or | +| | | ``"directory"``. | ++--------------------+------------+-------------------------------+ +| **created** | datetime | Creation date of the entity. | ++--------------------+------------+-------------------------------+ +| **last_modified** | datetime | Last modified date of the | +| | | entity. | ++--------------------+------------+-------------------------------+ +| **content** | variable | The "content" of the entity. | +| | | (:ref:`See | +| | | Below`) | ++--------------------+------------+-------------------------------+ +| **mimetype** | unicode or | The mimetype of ``content``, | +| | ``None`` | if any. (:ref:`See | +| | | Below`) | ++--------------------+------------+-------------------------------+ +| **format** | unicode or | The format of ``content``, | +| | ``None`` | if any. (:ref:`See | +| | | Below`) | ++--------------------+------------+-------------------------------+ +| [optional] | | | +| **hash** | unicode or | The hash of the contents. | +| | ``None`` | It cannot be null if | +| | | ``hash_algorithm`` is | +| | | defined. | ++--------------------+------------+-------------------------------+ +| [optional] | | | +| **hash_algorithm** | unicode or | The algorithm used to compute | +| | ``None`` | hash value. | +| | | It cannot be null | +| | | if ``hash`` is defined. | ++--------------------+------------+-------------------------------+ .. _modelcontent: From 470691ad0432c8a05b0f0ed191cf0695d27a84ce Mon Sep 17 00:00:00 2001 From: Wh1isper <9573586@qq.com> Date: Fri, 24 Nov 2023 01:02:07 +0800 Subject: [PATCH 16/16] Fixing typing:test --- .../services/contents/filecheckpoints.py | 4 ++-- jupyter_server/services/contents/fileio.py | 16 +++++++-------- .../services/contents/filemanager.py | 20 +++++++++---------- 3 files changed, 20 insertions(+), 20 deletions(-) diff --git a/jupyter_server/services/contents/filecheckpoints.py b/jupyter_server/services/contents/filecheckpoints.py index f6d1ef44e7..522b3bbd01 100644 --- a/jupyter_server/services/contents/filecheckpoints.py +++ b/jupyter_server/services/contents/filecheckpoints.py @@ -252,7 +252,7 @@ def get_file_checkpoint(self, checkpoint_id, path): if not os.path.isfile(os_checkpoint_path): self.no_such_checkpoint(path, checkpoint_id) - content, format = self._read_file(os_checkpoint_path, format=None) + content, format = self._read_file(os_checkpoint_path, format=None) # type: ignore[misc] return { "type": "file", "content": content, @@ -318,7 +318,7 @@ async def get_file_checkpoint(self, checkpoint_id, path): if not os.path.isfile(os_checkpoint_path): self.no_such_checkpoint(path, checkpoint_id) - content, format = await self._read_file(os_checkpoint_path, format=None) + content, format = await self._read_file(os_checkpoint_path, format=None) # type: ignore[misc] return { "type": "file", "content": content, diff --git a/jupyter_server/services/contents/fileio.py b/jupyter_server/services/contents/fileio.py index a404ee91b3..45607944ce 100644 --- a/jupyter_server/services/contents/fileio.py +++ b/jupyter_server/services/contents/fileio.py @@ -196,7 +196,7 @@ class FileManagerMixin(LoggingConfigurable, Configurable): If set to False, the new notebook is written directly on the old one which could fail (eg: full filesystem or quota )""", ) - hash_algorithm = Enum( + hash_algorithm = Enum( # type: ignore[call-overload] hashlib.algorithms_available, default_value="sha256", config=True, @@ -287,7 +287,7 @@ def _read_notebook( capture_validation_error=capture_validation_error, ) - return (nb, answer[2]) if raw else nb + return (nb, answer[2]) if raw else nb # type:ignore[misc] except Exception as e: e_orig = e @@ -340,8 +340,8 @@ def _get_hash(self, byte_content: bytes) -> dict[str, str]: return {"hash": h.hexdigest(), "hash_algorithm": algorithm} def _read_file( - self, os_path: str, format: str, raw: bool = False - ) -> tuple[str, str] | tuple[str, str, bytes]: + self, os_path: str, format: str | None, raw: bool = False + ) -> tuple[str | bytes, str] | tuple[str | bytes, str, bytes]: """Read a non-notebook file. Parameters @@ -446,7 +446,7 @@ async def _read_notebook( ), answer[0], ) - return (nb, answer[2]) if raw else nb + return (nb, answer[2]) if raw else nb # type:ignore[misc] except Exception as e: e_orig = e @@ -484,9 +484,9 @@ async def _save_notebook(self, os_path, nb, capture_validation_error=None): f, ) - async def _read_file( - self, os_path: str, format: str, raw: bool = False - ) -> tuple[str, str] | tuple[str, str, bytes]: + async def _read_file( # type: ignore[override] + self, os_path: str, format: str | None, raw: bool = False + ) -> tuple[str | bytes, str] | tuple[str | bytes, str, bytes]: """Read a non-notebook file. Parameters diff --git a/jupyter_server/services/contents/filemanager.py b/jupyter_server/services/contents/filemanager.py index 301de58106..c56a1acc70 100644 --- a/jupyter_server/services/contents/filemanager.py +++ b/jupyter_server/services/contents/filemanager.py @@ -357,7 +357,7 @@ def _file_model(self, path, content=True, format=None, require_hash=False): bytes_content = None if content: - content, format, bytes_content = self._read_file(os_path, format, raw=True) + content, format, bytes_content = self._read_file(os_path, format, raw=True) # type: ignore[misc] if model["mimetype"] is None: default_mime = { "text": "text/plain", @@ -372,8 +372,8 @@ def _file_model(self, path, content=True, format=None, require_hash=False): if require_hash: if bytes_content is None: - bytes_content, _ = self._read_file(os_path, "byte") - model.update(**self._get_hash(bytes_content)) + bytes_content, _ = self._read_file(os_path, "byte") # type: ignore[assignment,misc] + model.update(**self._get_hash(bytes_content)) # type: ignore[arg-type] return model @@ -402,8 +402,8 @@ def _notebook_model(self, path, content=True, require_hash=False): if require_hash: if bytes_content is None: - bytes_content, _ = self._read_file(os_path, "byte") - model.update(**self._get_hash(bytes_content)) + bytes_content, _ = self._read_file(os_path, "byte") # type: ignore[misc] + model.update(**self._get_hash(bytes_content)) # type: ignore[arg-type] return model @@ -828,7 +828,7 @@ async def _file_model(self, path, content=True, format=None, require_hash=False) bytes_content = None if content: - content, format, bytes_content = await self._read_file(os_path, format, raw=True) + content, format, bytes_content = await self._read_file(os_path, format, raw=True) # type: ignore[misc] if model["mimetype"] is None: default_mime = { "text": "text/plain", @@ -843,8 +843,8 @@ async def _file_model(self, path, content=True, format=None, require_hash=False) if require_hash: if bytes_content is None: - bytes_content, _ = await self._read_file(os_path, "byte") - model.update(**self._get_hash(bytes_content)) + bytes_content, _ = await self._read_file(os_path, "byte") # type: ignore[assignment,misc] + model.update(**self._get_hash(bytes_content)) # type: ignore[arg-type] return model @@ -871,8 +871,8 @@ async def _notebook_model(self, path, content=True, require_hash=False): if require_hash: if bytes_content is None: - bytes_content, _ = await self._read_file(os_path, "byte") - model.update(**(self._get_hash(bytes_content))) + bytes_content, _ = await self._read_file(os_path, "byte") # type: ignore[misc] + model.update(**(self._get_hash(bytes_content))) # type: ignore[arg-type] return model