From 76cfac8164c66c2ac50ef4c5aa02b1340e70b7e9 Mon Sep 17 00:00:00 2001 From: Mihai Maruseac Date: Tue, 19 Nov 2024 10:24:25 -0800 Subject: [PATCH] Add a higher level API. (#323) * Add a higher level API. Currently, this only supports Sigstore (and only DSSE on verification). I want to push this now so that people can start playing with it while I update the Colab demo and try to make the API more uniform and add support for the missing cases. With this, the default signing should be as simple as ``` from model_signing import api as model_signing_api model_signing_api.sign(model_path, signature_path) ``` And verification would be as simple as: ``` from model_signing import api as model_signing_api model_signing_api.verify( model_path, signature_path, expected_identity, expected_oidc_provider ) ``` Will think on how to simplify this even further.. Signed-off-by: Mihai Maruseac * Handle review comments Signed-off-by: Mihai Maruseac --------- Signed-off-by: Mihai Maruseac --- src/model_signing/__init__.py | 2 + src/model_signing/api.py | 582 ++++++++++++++++++++++++++++++++++ 2 files changed, 584 insertions(+) create mode 100644 src/model_signing/api.py diff --git a/src/model_signing/__init__.py b/src/model_signing/__init__.py index a7ff25f3..b8e8685d 100644 --- a/src/model_signing/__init__.py +++ b/src/model_signing/__init__.py @@ -12,4 +12,6 @@ # See the License for the specific language governing permissions and # limitations under the License. +"""For the stable high-level API, see model_signing.api.""" + __version__ = "0.0.2-alpha" diff --git a/src/model_signing/api.py b/src/model_signing/api.py new file mode 100644 index 00000000..5def0bb7 --- /dev/null +++ b/src/model_signing/api.py @@ -0,0 +1,582 @@ +# Copyright 2024 The Sigstore Authors +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""Public, high-level API for the model_signing library. + +Users should use this API to sign models and verify the model integrity instead +of reaching out to the internals of the library. We guarantee backwards +compatibility only for the API defined in this file. +""" + +from collections.abc import Callable, Iterable +import os +import pathlib +import sys +from typing import Literal, cast + +from model_signing.hashing import file +from model_signing.hashing import hashing +from model_signing.hashing import memory +from model_signing.manifest import manifest +from model_signing.serialization import serialize_by_file +from model_signing.serialization import serialize_by_file_shard +from model_signing.signing import in_toto +from model_signing.signing import signing +from model_signing.signing import sigstore + + +if sys.version_info >= (3, 11): + from typing import Self +else: + from typing_extensions import Self + + +def hash(model_path: os.PathLike) -> manifest.Manifest: + """Hashes a model using the default configuration. + + We use a separate method and configuration for hashing as it needs to be + common between signing and signature verification. 
Having this separate
+    also helps with performance testing, as hashing is expected to take the
+    largest amount of time (proportional to model size).
+
+    Since we need to be flexible on the serialization format, this returns a
+    manifest, instead of just a single digest. The type of returned manifest
+    depends on the configuration.
+
+    Args:
+        model_path: the path to the model to hash.
+
+    Returns:
+        A manifest of the hashed model.
+    """
+    return HashingConfig().hash(model_path)
+
+
+def sign(model_path: os.PathLike, signature_path: os.PathLike):
+    """Signs a model using the default configuration.
+
+    Args:
+        model_path: the path to the model to sign.
+        signature_path: the path of the resulting signature.
+    """
+    SigningConfig().sign(model_path, signature_path)
+
+
+def verify(
+    model_path: os.PathLike,
+    signature_path: os.PathLike,
+    *,
+    identity: str,
+    oidc_issuer: str | None = None,
+    use_staging: bool = False,
+):
+    """Verifies that a model conforms to a signature.
+
+    Currently, this assumes signatures over DSSE, using Sigstore. We will add
+    support for more cases in a future change.
+
+    Args:
+        model_path: the path to the model to verify.
+        signature_path: the path to the signature to check.
+        identity: The expected identity that has signed the model.
+        oidc_issuer: The expected OpenID Connect issuer that provided the
+            certificate used for the signature.
+        use_staging: Use staging configurations, instead of production. This
+            is supposed to be set to True only when testing. Default is False.
+    """
+    VerificationConfig().set_sigstore_dsse_verifier(
+        identity=identity, oidc_issuer=oidc_issuer, use_staging=use_staging
+    ).verify(model_path, signature_path)
+
+
+class HashingConfig:
+    """Configuration to use when hashing models.
+
+    Hashing a model results in a `manifest.Manifest` object. This may contain a
+    single digest for the entire model, or be a pairing between model components
+    (e.g., files, file shards, etc.) and their corresponding hash.
+
+    This configuration class allows selecting the serialization method to
+    generate the desired manifest format.
+
+    This configuration class also allows configuring files from within the model
+    directory that should be ignored. These are files that don't impact the
+    behavior of the model, or files that won't be distributed with the model.
+
+    Note that currently this configuration class only supports the main options
+    provided by the library. For more granular choices, usage of the lower level
+    APIs is recommended.
+    """
+
+    def __init__(self):
+        """Initializes the default configuration for hashing.
+
+        The default hashing configuration uses SHA256 to compute the digest of
+        every file in the model. The resulting manifest is a listing of files
+        paired with their hashes. By default, no file is ignored and any
+        symbolic link in the model directory results in an error.
+        """
+        self._ignored_paths = frozenset()
+        self._serializer = serialize_by_file.ManifestSerializer(
+            self._build_file_hasher_factory(), allow_symlinks=False
+        )
+
+    def hash(self, model_path: os.PathLike) -> manifest.Manifest:
+        """Hashes a model using the current configuration."""
+        return self._serializer.serialize(
+            pathlib.Path(model_path), ignore_paths=self._ignored_paths
+        )
+
+    def _build_stream_hasher(
+        self, hashing_algorithm: Literal["sha256", "blake2"] = "sha256"
+    ) -> hashing.StreamingHashEngine:
+        """Builds a streaming hasher from a constant string.
+
+        Args:
+            hashing_algorithm: the hashing algorithm to use.
+ + Returns: + An instance of the requested hasher. + """ + match hashing_algorithm: + case "sha256": + return memory.SHA256() + case "blake2": + return memory.BLAKE2() + case _: + raise ValueError( + f"Unsupported hashing method {hashing_algorithm}" + ) + + def _build_file_hasher_factory( + self, + hashing_algorithm: Literal["sha256", "blake2"] = "sha256", + chunk_size: int = 8192, + ) -> Callable[[pathlib.Path], file.SimpleFileHasher]: + """Builds the hasher factory for a serialization by file. + + Args: + hashing_algorithm: the hashing algorithm to use to hash a file + chunk_size: The amount of file to read at once. Default is 8KB. A + special value of 0 signals to attempt to read everything in a + single call. + + Returns: + The hasher factory that should be used by the active serialization + method. + """ + algorithm = self._build_stream_hasher(hashing_algorithm) + + def factory(path: pathlib.Path) -> file.SimpleFileHasher: + return file.SimpleFileHasher(path, algorithm, chunk_size=chunk_size) + + return factory + + def _build_sharded_file_hasher_factory( + self, + hashing_algorithm: Literal["sha256", "blake2"] = "sha256", + chunk_size: int = 8192, + shard_size: int = 1000000, + ) -> Callable[[pathlib.Path, int, int], file.ShardedFileHasher]: + """Builds the hasher factory for a serialization by file shards. + + Args: + hashing_algorithm: the hashing algorithm to use to hash a file + chunk_size: The amount of file to read at once. Default is 8KB. A + special value of 0 signals to attempt to read everything in a + single call. + shard_size: The size of a file shard. Default is 1,000,000 bytes. + + Returns: + The hasher factory that should be used by the active serialization + method. + """ + algorithm = self._build_stream_hasher(hashing_algorithm) + + def factory( + path: pathlib.Path, start: int, end: int + ) -> file.ShardedFileHasher: + return file.ShardedFileHasher( + path, + algorithm, + start=start, + end=end, + chunk_size=chunk_size, + shard_size=shard_size, + ) + + return factory + + def set_serialize_by_file_to_manifest( + self, + *, + hashing_algorithm: Literal["sha256", "blake2"] = "sha256", + chunk_size: int = 8192, + max_workers: int | None = None, + allow_symlinks: bool = False, + ) -> Self: + """Configures serialization to a manifest pairing files with hashes. + + The serialization method in this configuration is changed to one where + every file in the model is paired with its digest and a manifest + containing all these pairings is being returned. + + Args: + hashing_algorithm: the hashing algorithm to use to hash a file + chunk_size: The amount of file to read at once. Default is 8KB. A + special value of 0 signals to attempt to read everything in a + single call. + max_workers: Maximum number of workers to use in parallel. Default + is to defer to the `concurrent.futures` library. + allow_symlinks: Controls whether symbolic links are included. If a + symlink is present but the flag is `False` (default) the + serialization would raise an error. + + Returns: + The new hashing configuration with the new serialization method. 
+ """ + self._serializer = serialize_by_file.ManifestSerializer( + self._build_file_hasher_factory(hashing_algorithm, chunk_size), + max_workers=max_workers, + allow_symlinks=allow_symlinks, + ) + return self + + def set_serialize_by_file_to_digest( + self, + *, + hashing_algorithm: Literal["sha256", "blake2"] = "sha256", + merge_algorithm: Literal["sha256", "blake2"] = "sha256", + chunk_size: int = 8192, + allow_symlinks: bool = False, + ) -> Self: + """Configures serialization to a single digest, at file granularity. + + The serialization method in this configuration is changed to one where + every file in the model is paired with its digest and then a single + digest is computed over this pairing. + + Args: + hashing_algorithm: the hashing algorithm to use to hash a file + merge_algorithm: the hashing algorithm to use when computing the + final digest over all the (file, digest) pairings + chunk_size: The amount of file to read at once. Default is 8KB. A + special value of 0 signals to attempt to read everything in a + single call. + allow_symlinks: Controls whether symbolic links are included. If a + symlink is present but the flag is `False` (default) the + serialization would raise an error. + + Returns: + The new hashing configuration with the new serialization method. + """ + # TODO: https://github.com/sigstore/model-transparency/issues/197 - + # Because the API for this case is different than the other ones, we + # have to perform additional steps here. + file_hasher = cast( + file.SimpleFileHasher, + self._build_file_hasher_factory( + hashing_algorithm, chunk_size=chunk_size + )(pathlib.Path("unused")), + ) + merge_hasher = self._build_stream_hasher(merge_algorithm).__class__ + self._serializer = serialize_by_file.DigestSerializer( + file_hasher, merge_hasher, allow_symlinks=allow_symlinks + ) + return self + + def set_serialize_by_file_shard_to_manifest( + self, + *, + hashing_algorithm: Literal["sha256", "blake2"] = "sha256", + chunk_size: int = 8192, + shard_size: int = 1000000, + max_workers: int | None = None, + allow_symlinks: bool = False, + ) -> Self: + """Configures serialization to a manifest of (file shard, hash) pairs. + + The serialization method in this configuration is changed to one where + every file in the model is sharded in equal sized shards and every shard + is paired with its digest and a manifest containing all these pairings + is being returned. + + Args: + hashing_algorithm: the hashing algorithm to use to hash a file shard + chunk_size: The amount of file to read at once. Default is 8KB. A + special value of 0 signals to attempt to read everything in a + single call. + shard_size: The size of a file shard. Default is 1,000,000 bytes. + max_workers: Maximum number of workers to use in parallel. Default + is to defer to the `concurrent.futures` library. + allow_symlinks: Controls whether symbolic links are included. If a + symlink is present but the flag is `False` (default) the + serialization would raise an error. + + Returns: + The new hashing configuration with the new serialization method. 
+ """ + self._serializer = serialize_by_file_shard.ManifestSerializer( + self._build_sharded_file_hasher_factory( + hashing_algorithm, chunk_size, shard_size + ), + max_workers=max_workers, + allow_symlinks=allow_symlinks, + ) + return self + + def set_serialize_by_file_shard_to_digest( + self, + *, + hashing_algorithm: Literal["sha256", "blake2"] = "sha256", + merge_algorithm: Literal["sha256", "blake2"] = "sha256", + chunk_size: int = 8192, + shard_size: int = 1000000, + max_workers: int | None = None, + allow_symlinks: bool = False, + ) -> Self: + """Configures serialization to a single digest, at shard granularity. + + The serialization method in this configuration is changed to one where + every file shard in the model is paired with its digest and then a + single digest is computed over all entries in this pairing. + + Args: + hashing_algorithm: the hashing algorithm to use to hash a file shard + merge_algorithm: the hashing algorithm to use when computing the + final digest over all the (file, digest) pairings + chunk_size: The amount of file to read at once. Default is 8KB. A + special value of 0 signals to attempt to read everything in a + single call. + shard_size: The size of a file shard. Default is 1,000,000 bytes. + max_workers: Maximum number of workers to use in parallel. Default + is to defer to the `concurrent.futures` library. + allow_symlinks: Controls whether symbolic links are included. If a + symlink is present but the flag is `False` (default) the + serialization would raise an error. + + Returns: + The new hashing configuration with the new serialization method. + """ + merge_hasher = self._build_stream_hasher(merge_algorithm) + self._serializer = serialize_by_file_shard.DigestSerializer( + self._build_sharded_file_hasher_factory( + hashing_algorithm, chunk_size, shard_size + ), + merge_hasher, + max_workers=max_workers, + allow_symlinks=allow_symlinks, + ) + return self + + def set_ignored_paths(self, paths: Iterable[os.PathLike]) -> Self: + """Configures the paths to be ignored during serialization of a model. + + If the model is a single file, there are no paths that are ignored. If + the model is a directory, all paths must be within the model directory. + If a path to be ignored is absolute, we convert it to a path within the + model directory during serialization. If the path is relative, it is + assumed to be relative to the model root. + + If a path is a directory, serialization will ignore both the path and + any of its children. + + Args: + paths: the paths to ignore + + Returns: + The new hashing configuration with a new set of ignored paths. + """ + self._ignored_paths = frozenset({pathlib.Path(p) for p in paths}) + return self + + +class SigningConfig: + """Configuration to use when signing models. + + The signing configuration is used to decouple between serialization formats + and signing types. This configuration class allows setting up the + serialization format, the method to convert a `manifest.Manifest` to a + signing payload and the engine used for signing (currently, only supporting + Sigstore at this level). + """ + + def __init__(self): + """Initializes the default configuration for signing.""" + self._hashing_config = HashingConfig() + self._payload_generator = in_toto.DigestsIntotoPayload.from_manifest + self._signer = sigstore.SigstoreDSSESigner( + use_ambient_credentials=False, use_staging=False + ) + + def sign(self, model_path: os.PathLike, signature_path: os.PathLike): + """Signs a model using the current configuration. 
+
+        Args:
+            model_path: the path to the model to sign.
+            signature_path: the path of the resulting signature.
+        """
+        manifest = self._hashing_config.hash(model_path)
+        payload = self._payload_generator(manifest)
+        signature = self._signer.sign(payload)
+        signature.write(pathlib.Path(signature_path))
+
+    def set_hashing_config(self, hashing_config: HashingConfig) -> Self:
+        """Sets the new configuration for hashing models.
+
+        Args:
+            hashing_config: the new hashing configuration.
+
+        Returns:
+            The new signing configuration.
+        """
+        self._hashing_config = hashing_config
+        return self
+
+    def set_payload_generator(
+        self, generator: Callable[[manifest.Manifest], signing.SigningPayload]
+    ) -> Self:
+        """Sets the conversion from manifest to signing payload.
+
+        Since we want to support multiple serialization formats and multiple
+        signing solutions, we use a payload generator to relax the coupling
+        between the two.
+
+        Args:
+            generator: the conversion function from a `manifest.Manifest` to a
+                `signing.SigningPayload` payload.
+
+        Returns:
+            The new signing configuration.
+        """
+        self._payload_generator = generator
+        return self
+
+    def set_sigstore_signer(
+        self,
+        *,
+        sign_dsse: bool = True,
+        oidc_issuer: str | None = None,
+        use_ambient_credentials: bool = True,
+        use_staging: bool = False,
+        identity_token: str | None = None,
+    ) -> Self:
+        """Configures the signing to be performed with Sigstore.
+
+        Only one signer can be configured. Currently, we only support Sigstore
+        in the API, but the CLI supports signing with PKI, BYOK and no signing.
+        We will merge the configurations in a subsequent change.
+
+        Args:
+            sign_dsse: Sign a DSSE statement (if True) or a binary blob.
+            oidc_issuer: An optional OpenID Connect issuer to use instead of the
+                default production one. Only relevant if `use_staging = False`.
+                Default is empty, relying on the Sigstore configuration.
+            use_ambient_credentials: Use ambient credentials (also known as
+                Workload Identity). Default is True. If ambient credentials cannot
+                be used (not available, or option disabled), a flow to get signer
+                identity via OIDC will start.
+            use_staging: Use staging configurations, instead of production. This
+                is supposed to be set to True only when testing. Default is False.
+            identity_token: An explicit identity token to use when signing,
+                taking precedence over any ambient credential or OAuth workflow.
+
+        Returns:
+            The new signing configuration.
+        """
+        if sign_dsse:
+            signer_factory = sigstore.SigstoreDSSESigner
+        else:
+            signer_factory = sigstore.SigstoreArtifactSigner
+
+        self._signer = signer_factory(
+            oidc_issuer=oidc_issuer,
+            use_ambient_credentials=use_ambient_credentials,
+            use_staging=use_staging,
+            identity_token=identity_token,
+        )
+        return self
+
+
+class VerificationConfig:
+    """Configuration to use when verifying models against signatures.
+
+    The verification configuration is used to decouple serialization formats
+    from verification methods. This configuration class allows setting up the
+    serialization format used when hashing the model and the engine used to
+    verify the signature (currently, only supporting Sigstore over DSSE at
+    this level).
+    """
+
+    def __init__(self):
+        """Initializes the default configuration for verification."""
+        self._hashing_config = HashingConfig()
+        self._verifier = None
+
+    def verify(self, model_path: os.PathLike, signature_path: os.PathLike):
+        """Verifies that a model conforms to a signature.
+
+        Args:
+            model_path: the path to the model to verify.
+            signature_path: the path to the signature to check.
+        """
+        signature = sigstore.SigstoreSignature.read(
+            pathlib.Path(signature_path)
+        )
+        expected_manifest = self._verifier.verify(signature)
+        actual_manifest = self._hashing_config.hash(model_path)
+
+        if actual_manifest != expected_manifest:
+            raise ValueError("Signature mismatch")
+
+    def set_hashing_config(self, hashing_config: HashingConfig) -> Self:
+        """Sets the new configuration for hashing models.
+
+        Args:
+            hashing_config: the new hashing configuration.
+
+        Returns:
+            The new verification configuration.
+        """
+        self._hashing_config = hashing_config
+        return self
+
+    def set_sigstore_dsse_verifier(
+        self,
+        *,
+        identity: str,
+        oidc_issuer: str | None = None,
+        use_staging: bool = False,
+    ) -> Self:
+        """Configures the verification of a Sigstore signature over DSSE.
+
+        Only one verifier can be configured. Currently, we only support Sigstore
+        in the API, but the CLI supports signing with PKI, BYOK and no
+        signing/verification. We will merge the configurations in a subsequent
+        change.
+
+        Args:
+            identity: The expected identity that has signed the model.
+            oidc_issuer: The expected OpenID Connect issuer that provided the
+                certificate used for the signature.
+            use_staging: Use staging configurations, instead of production. This
+                is supposed to be set to True only when testing. Default is False.
+
+        Returns:
+            The new verification configuration.
+        """
+        self._verifier = sigstore.SigstoreDSSEVerifier(
+            identity=identity, oidc_issuer=oidc_issuer, use_staging=use_staging
+        )
+        return self
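
For readers trying out the new API, a minimal usage sketch (illustrative only, not part of the patch): it reuses one `HashingConfig` for both signing and verification, since the manifest has to be computed the same way on both sides. The model path, ignored files, identity, and issuer below are placeholder values.

```
# Illustrative sketch; paths, ignored files, identity, and issuer are
# placeholders.
from model_signing import api as model_signing_api

# Share one hashing configuration between signing and verification so both
# sides compute comparable manifests; ignore files that don't affect the model.
hashing_config = model_signing_api.HashingConfig().set_ignored_paths(
    ["README.md", "licenses/"]
)

# Sign with the default Sigstore DSSE signer, but with the custom hashing.
model_signing_api.SigningConfig().set_hashing_config(hashing_config).sign(
    "path/to/model", "model.sig"
)

# Verify against the same hashing configuration and the expected identity.
model_signing_api.VerificationConfig().set_hashing_config(
    hashing_config
).set_sigstore_dsse_verifier(
    identity="name@example.com", oidc_issuer="https://accounts.example.com"
).verify("path/to/model", "model.sig")
```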