Change md5 to hash and hash_algorithm, fix incompatibility #1367

Merged
merged 16 commits on Nov 24, 2023
18 changes: 18 additions & 0 deletions .github/workflows/downstream.yml
@@ -107,6 +107,23 @@ jobs:
test_command: pip install pytest-jupyter[server] && pytest -vv -raXxs -W default --durations 10 --color=yes
package_name: jupyter_server_terminals

jupytext:
runs-on: ubuntu-latest
timeout-minutes: 10

steps:
- name: Checkout
uses: actions/checkout@v4

- name: Base Setup
uses: jupyterlab/maintainer-tools/.github/actions/base-setup@v1

- name: Test jupytext
uses: jupyterlab/maintainer-tools/.github/actions/downstream-test@v1
with:
package_name: jupytext
test_command: pip install pytest-jupyter[server] gitpython pre-commit && python -m ipykernel install --name jupytext-dev --user && pytest -vv -raXxs -W default --durations 10 --color=yes --ignore=tests/test_doc_files_are_notebooks.py --ignore=tests/test_changelog.py
Contributor Author


test_doc_files_are_notebooks.py and test_changelog.py need the docs folder, which is not included in the distribution.


downstream_check: # This job does nothing and is only used for the branch protection
if: always()
needs:
@@ -115,6 +132,7 @@ jobs:
- jupyterlab_server
- notebook
- nbclassic
- jupytext
runs-on: ubuntu-latest
steps:
- name: Decide whether the needed jobs succeeded or failed
20 changes: 14 additions & 6 deletions docs/source/developers/contents.rst
@@ -63,10 +63,14 @@ Models may contain the following entries:
| |``None`` |if any. (:ref:`See |
| | |Below<modelcontent>`) |
+--------------------+-----------+------------------------------+
|**md5** |unicode or |The md5 of the contents. |
|**hash** |unicode or |The hash of the contents. |
| |``None`` | |
| | | |
+--------------------+-----------+------------------------------+
|**hash_algorithm** |unicode |The algorithm used to compute |
| | |hash value. |
| | | |
+--------------------+-----------+------------------------------+

.. _modelcontent:

@@ -80,8 +84,9 @@ model. There are three model types: **notebook**, **file**, and **directory**.
:class:`nbformat.notebooknode.NotebookNode` representing the .ipynb file
represented by the model. See the `NBFormat`_ documentation for a full
description.
- The ``md5`` field is a hexdigest string of the md5 value of the notebook
file.
- The ``hash`` field is a hexdigest string of the hash value of the file.
If ``ContentsManager.get`` does not support hashing, it should always be ``None``.
- ``hash_algorithm`` is the algorithm used to compute the hash value.

- ``file`` models
- The ``format`` field is either ``"text"`` or ``"base64"``.
@@ -91,14 +96,16 @@ model. There are three model types: **notebook**, **file**, and **directory**.
file models, ``content`` simply contains the file's bytes after decoding
as UTF-8. Non-text (``base64``) files are read as bytes, base64 encoded,
and then decoded as UTF-8.
- The ``md5`` field is a hexdigest string of the md5 value of the file.
- The ``hash`` field is a hexdigest string of the hash value of the file.
If ``ContentsManager.get`` does not support hashing, it should always be ``None``.
- ``hash_algorithm`` is the algorithm used to compute the hash value.

- ``directory`` models
- The ``format`` field is always ``"json"``.
- The ``mimetype`` field is always ``None``.
- The ``content`` field contains a list of :ref:`content-free<contentfree>`
models representing the entities in the directory.
- The ``md5`` field is always ``None``.
- The ``hash`` field is always ``None``.

.. note::

@@ -137,7 +144,8 @@ model. There are three model types: **notebook**, **file**, and **directory**.
"path": "foo/a.ipynb",
"type": "notebook",
"writable": True,
"md5": "7e47382b370c05a1b14706a2a8aff91a",
"hash": "f5e43a0b1c2e7836ab3b4d6b1c35c19e2558688de15a6a14e137a59e4715d34b",
"hash_algorithm": "sha256",
}
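The ``hash`` value in the model above is just a hexdigest of the file's raw bytes. A minimal sketch of how such a value is produced with the standard library (the input bytes here are illustrative, not the real notebook contents):

```python
import hashlib

def content_hash(data: bytes, algorithm: str = "sha256") -> str:
    """Return the hexdigest of *data*, as stored in the model's "hash" field."""
    h = hashlib.new(algorithm)
    h.update(data)
    return h.hexdigest()

# A sha256 hexdigest is always 64 hex characters, matching the example model.
digest = content_hash(b'{"cells": []}')
```

Because ``hashlib.new`` takes the algorithm name as a string, the same helper works for any value a server might report in ``hash_algorithm``.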

# Notebook Model without Content
16 changes: 10 additions & 6 deletions jupyter_server/services/api/api.yaml
@@ -106,9 +106,9 @@ paths:
in: query
description: "Return content (0 for no content, 1 for return content)"
type: integer
- name: md5
- name: hash
in: query
description: "Return md5 hexdigest string of content (0 for no md5, 1 for return md5)"
description: "Return the hexdigest hash string of content (0 for no hash, 1 to return the hash). Ignored if the ContentsManager does not support hashing."
type: integer
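Since ``hash`` is an ordinary query parameter, a client simply appends it to the contents URL. A sketch of building such a request URL (the base URL and path are placeholders; a real request also needs an authentication token):

```python
from urllib.parse import urlencode, urljoin

# Hypothetical local server and file path.
base = "http://localhost:8888/"
path = "foo/a.ipynb"

# Ask for the hash but skip the (possibly large) content payload.
query = urlencode({"content": 0, "hash": 1})
url = urljoin(base, f"api/contents/{path}") + "?" + query
```

The response model would then carry ``content: null`` alongside the populated ``hash`` and ``hash_algorithm`` fields.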
responses:
404:
@@ -889,7 +889,7 @@ definitions:
kernel:
$ref: "#/definitions/Kernel"
Contents:
description: "A contents object. The content and format keys may be null if content is not contained. The md5 maybe null if md5 is not contained. If type is 'file', then the mimetype will be null."
description: "A contents object. The content and format keys may be null if content is not contained. The hash may be null if a hash is not requested. If type is 'file', then the mimetype will be null."
type: object
required:
- type
@@ -901,7 +901,8 @@ definitions:
- mimetype
- format
- content
- md5
- hash
- hash_algorithm
properties:
name:
type: string
@@ -939,9 +940,12 @@ definitions:
format:
type: string
description: Format of content (one of null, 'text', 'base64', 'json')
md5:
hash:
type: string
description: "The md5 hexdigest string of content, if requested (otherwise null)."
description: "The hexdigest hash string of content, if requested (otherwise null)."
hash_algorithm:
type: string
description: "The algorithm used to produce the hash."
Checkpoints:
description: A checkpoint object.
type: object
4 changes: 2 additions & 2 deletions jupyter_server/services/contents/filecheckpoints.py
@@ -252,7 +252,7 @@ def get_file_checkpoint(self, checkpoint_id, path):
if not os.path.isfile(os_checkpoint_path):
self.no_such_checkpoint(path, checkpoint_id)

content, format = self._read_file(os_checkpoint_path, format=None)
content, format, _ = self._read_file(os_checkpoint_path, format=None)
return {
"type": "file",
"content": content,
@@ -318,7 +318,7 @@ async def get_file_checkpoint(self, checkpoint_id, path):
if not os.path.isfile(os_checkpoint_path):
self.no_such_checkpoint(path, checkpoint_id)

content, format = await self._read_file(os_checkpoint_path, format=None)
content, format, _ = await self._read_file(os_checkpoint_path, format=None)
return {
"type": "file",
"content": content,
66 changes: 46 additions & 20 deletions jupyter_server/services/contents/fileio.py
@@ -14,7 +14,7 @@
import nbformat
from anyio.to_thread import run_sync
from tornado.web import HTTPError
from traitlets import Bool
from traitlets import Bool, Unicode
from traitlets.config import Configurable
from traitlets.config.configurable import LoggingConfigurable

@@ -193,6 +193,12 @@ class FileManagerMixin(LoggingConfigurable, Configurable):
If set to False, the new notebook is written directly on the old one which could fail (eg: full filesystem or quota )""",
)

hash_algorithm = Unicode(
"sha256",
config=True,
help="Hash algorithm to use for file content, as supported by hashlib.",
)
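Because the trait accepts any name that ``hashlib`` knows (e.g. ``c.FileContentsManager.hash_algorithm = "sha512"`` in a ``jupyter_server_config.py``), a configured value can be validated up front. A small stand-alone sketch of that check:

```python
import hashlib

# Hypothetical configured value; any name hashlib supports is acceptable.
algorithm = "sha512"

if algorithm not in hashlib.algorithms_available:
    raise ValueError(f"Unsupported hash algorithm: {algorithm!r}")

digest = hashlib.new(algorithm, b"example").hexdigest()
```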

@contextmanager
def open(self, os_path, *args, **kwargs):
"""wrapper around io.open that turns permission errors into 403"""
@@ -305,7 +311,16 @@ def _save_notebook(self, os_path, nb, capture_validation_error=None):
capture_validation_error=capture_validation_error,
)

def _read_file(self, os_path, format):
def _get_hash_from_file(self, os_path):
_, _, h = self._read_file(os_path, "byte", require_hash=True)
return h

def _get_hash_from_content(self, bcontent: bytes):
h = hashlib.new(self.hash_algorithm)
h.update(bcontent)
return h.hexdigest()

def _read_file(self, os_path, format, require_hash=False):
"""Read a non-notebook file.

os_path: The path to be read.
@@ -314,29 +329,35 @@ def _read_file(self, os_path, format):
If 'base64', the raw bytes contents will be encoded as base64.
If 'byte', the raw bytes contents will be returned.
If not specified, try to decode as UTF-8, and fall back to base64
require_hash:
If True, also return the hash of the file contents as a hex string.
"""
if not os.path.isfile(os_path):
raise HTTPError(400, "Cannot read non-file %s" % os_path)

with self.open(os_path, "rb") as f:
bcontent = f.read()

# Calculate hash value if required
hash_value = self._get_hash_from_content(bcontent) if require_hash else None

if format == "byte":
# Not for http response but internal use
return bcontent, "byte"
return bcontent, "byte", hash_value

if format is None or format == "text":
# Try to interpret as unicode if format is unknown or if unicode
# was explicitly requested.
try:
return bcontent.decode("utf8"), "text"
return bcontent.decode("utf8"), "text", hash_value
except UnicodeError as e:
if format == "text":
raise HTTPError(
400,
"%s is not UTF-8 encoded" % os_path,
reason="bad format",
) from e
return encodebytes(bcontent).decode("ascii"), "base64"
return encodebytes(bcontent).decode("ascii"), "base64", hash_value
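The decode-with-fallback behaviour above can be sketched as a stand-alone function (simplified: sha256 is hard-coded for brevity, whereas the real method honors ``hash_algorithm`` and raises ``HTTPError`` for an explicit ``text`` request):

```python
import hashlib
from base64 import encodebytes

def read_bytes(bcontent: bytes, fmt=None, require_hash=False):
    """Sketch of _read_file's fallback: try UTF-8, else base64-encode."""
    hash_value = hashlib.sha256(bcontent).hexdigest() if require_hash else None
    try:
        return bcontent.decode("utf8"), "text", hash_value
    except UnicodeError:
        if fmt == "text":
            raise  # the real method wraps this in a 400 HTTPError
        return encodebytes(bcontent).decode("ascii"), "base64", hash_value

# Non-UTF-8 bytes fall back to base64; the hash is of the raw bytes either way.
content, fmt, digest = read_bytes(b"\xff\xfe", require_hash=True)
```

Note that the hash is always computed over the raw on-disk bytes, before any decoding or base64 encoding, so the same file yields the same digest regardless of format.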

def _save_file(self, os_path, content, format):
"""Save content of a generic file."""
@@ -357,12 +378,6 @@ def _save_file(self, os_path, content, format):
with self.atomic_writing(os_path, text=False) as f:
f.write(bcontent)

def _get_md5(self, os_path):
c, _ = self._read_file(os_path, "byte")
md5 = hashlib.md5()
md5.update(c)
return md5.hexdigest()


class AsyncFileManagerMixin(FileManagerMixin):
"""
@@ -423,7 +438,7 @@ async def _save_notebook(self, os_path, nb, capture_validation_error=None):
f,
)

async def _read_file(self, os_path, format):
async def _read_file(self, os_path, format, require_hash=False):
"""Read a non-notebook file.

os_path: The path to be read.
@@ -432,29 +447,37 @@ async def _read_file(self, os_path, format):
If 'base64', the raw bytes contents will be encoded as base64.
If 'byte', the raw bytes contents will be returned.
If not specified, try to decode as UTF-8, and fall back to base64
require_hash:
If True, also return the hash of the file contents as a hex string.
"""
if not os.path.isfile(os_path):
raise HTTPError(400, "Cannot read non-file %s" % os_path)

with self.open(os_path, "rb") as f:
bcontent = await run_sync(f.read)

if require_hash:
hash_value = await self._get_hash_from_content(bcontent)
else:
hash_value = None

if format == "byte":
# Not for http response but internal use
return bcontent, "byte"
return bcontent, "byte", hash_value

if format is None or format == "text":
# Try to interpret as unicode if format is unknown or if unicode
# was explicitly requested.
try:
return bcontent.decode("utf8"), "text"
return bcontent.decode("utf8"), "text", hash_value
except UnicodeError as e:
if format == "text":
raise HTTPError(
400,
"%s is not UTF-8 encoded" % os_path,
reason="bad format",
) from e
return encodebytes(bcontent).decode("ascii"), "base64"
return encodebytes(bcontent).decode("ascii"), "base64", hash_value

async def _save_file(self, os_path, content, format):
"""Save content of a generic file."""
@@ -475,8 +498,11 @@ async def _save_file(self, os_path, content, format):
with self.atomic_writing(os_path, text=False) as f:
await run_sync(f.write, bcontent)

async def _get_md5(self, os_path):
c, _ = await self._read_file(os_path, "byte")
md5 = hashlib.md5()
await run_sync(md5.update, c)
return md5.hexdigest()
async def _get_hash_from_content(self, bcontent: bytes):
h = hashlib.new(self.hash_algorithm)
await run_sync(h.update, bcontent)
return h.hexdigest()
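The async variant pushes the (potentially expensive) ``update`` call off the event loop. A minimal stand-alone sketch of the same pattern using only the standard library (``asyncio.to_thread`` here stands in for anyio's ``run_sync``, which the mixin actually uses):

```python
import asyncio
import hashlib

async def hash_bytes(bcontent: bytes, algorithm: str = "sha256") -> str:
    """Hash in a worker thread so large payloads don't block the event loop."""
    h = hashlib.new(algorithm)
    await asyncio.to_thread(h.update, bcontent)
    return h.hexdigest()

digest = asyncio.run(hash_bytes(b"hello"))
```

For small files the thread hop costs more than it saves, but it keeps the server responsive when hashing large notebooks or binary files.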

async def _get_hash_from_file(self, os_path):
_, _, h = await self._read_file(os_path, "byte", require_hash=True)
return h