diff --git a/MODULE.bazel b/MODULE.bazel index e89b8ef67..016704601 100644 --- a/MODULE.bazel +++ b/MODULE.bazel @@ -4,7 +4,7 @@ module( compatibility_level = 1, ) -bazel_dep(name = "bazel_features", version = "1.1.1") +bazel_dep(name = "bazel_features", version = "1.2.0") bazel_dep(name = "bazel_skylib", version = "1.3.0") bazel_dep(name = "platforms", version = "0.0.4") @@ -50,6 +50,17 @@ python.toolchain( ) use_repo(python, "pythons_hub") +pip = use_extension("//python/extensions:pip.bzl", "pip") + +# use the following repo, which contains references to things we download from +# PyPI via the experimental rules, this makes it possible to pass regular labels +# to the whl_library rule instantiation within the pip extension itself. +# +# However, I am not sure how this plays with `dev_dependency = True` and/or +# `isolated = True`. It could be that it might fail. +# TODO @aignas 2023-12-24: test this. +use_repo(pip, "pypi_whl") + # This call registers the Python toolchains. register_toolchains("@pythons_hub//:all") diff --git a/examples/bzlmod/.gitignore b/examples/bzlmod/.gitignore index ac51a054d..0d4fed27c 100644 --- a/examples/bzlmod/.gitignore +++ b/examples/bzlmod/.gitignore @@ -1 +1,2 @@ bazel-* +MODULE.bazel.lock diff --git a/examples/bzlmod/MODULE.bazel b/examples/bzlmod/MODULE.bazel index e49b586fe..979f74875 100644 --- a/examples/bzlmod/MODULE.bazel +++ b/examples/bzlmod/MODULE.bazel @@ -51,6 +51,15 @@ use_repo(python, "python_3_10", "python_3_9", "python_versions") # You are able to set a hub name, so that you can have different modifications of the same # wheel in different pip hubs. pip = use_extension("@rules_python//python/extensions:pip.bzl", "pip") +pip.experimental_target_platforms( + enabled = True, + # Specify PyPI indexes in the order of preference. If the package is not found in the first + # index, it will be looked up in the next one, which means that it makes it useful to order + # the index URLs by the number of packages it contains. 
+ extra_index_urls = [], + # Specify the main PyPI index URL to use. This is value the default. + index_url = "https://pypi.org/simple", +) # Call whl_mods.create for the requests package. pip.whl_mods( diff --git a/examples/bzlmod/whl_mods/appended_build_content.BUILD b/examples/bzlmod/whl_mods/appended_build_content.BUILD index 0ca118d7b..9d9f2cd2b 100644 --- a/examples/bzlmod/whl_mods/appended_build_content.BUILD +++ b/examples/bzlmod/whl_mods/appended_build_content.BUILD @@ -8,9 +8,5 @@ write_file( filegroup( name = "whl_orig", - srcs = glob( - ["*.whl"], - allow_empty = False, - exclude = ["*-patched-*.whl"], - ), + srcs = ["_whl"], ) diff --git a/python/pip_install/pip_repository.bzl b/python/pip_install/pip_repository.bzl index fe58472f5..e9e9d78e5 100644 --- a/python/pip_install/pip_repository.bzl +++ b/python/pip_install/pip_repository.bzl @@ -713,18 +713,37 @@ def _whl_library_impl(rctx): # Manually construct the PYTHONPATH since we cannot use the toolchain here environment = _create_repository_execution_environment(rctx, python_interpreter) - result = rctx.execute( - args, - environment = environment, - quiet = rctx.attr.quiet, - timeout = rctx.attr.timeout, - ) - if result.return_code: - fail("whl_library %s failed: %s (%s) error code: '%s'" % (rctx.attr.name, result.stdout, result.stderr, result.return_code)) + whl_path = None + whl_label = None + if rctx.attr.experimental_whl_label: + # This label may be a hub repo if that is the case, resolve it to the + # spoke repo label. For reasons why see `../private/bzlmod/pypi_metadata.bzl`. + # + # This assumes that whls are only downloaded with the pypi_file repo rule. 
+ whl_label = rctx.attr.experimental_whl_label + whl_path = rctx.path(Label("@@{hub_repo_name}_{dist}//:{filename}".format( + hub_repo_name = whl_label.workspace_name, + dist = whl_label.package, + filename = whl_label.name, + ))) + + if whl_path.basename.endswith("tar.gz"): + whl_path = None + whl_label = None + + if whl_path == None: + result = rctx.execute( + args, + environment = environment, + quiet = rctx.attr.quiet, + timeout = rctx.attr.timeout, + ) + if result.return_code: + fail("whl_library %s failed: %s (%s) error code: '%s'" % (rctx.attr.name, result.stdout, result.stderr, result.return_code)) - whl_path = rctx.path(json.decode(rctx.read("whl_file.json"))["whl_file"]) - if not rctx.delete("whl_file.json"): - fail("failed to delete the whl_file.json file") + whl_path = rctx.path(json.decode(rctx.read("whl_file.json"))["whl_file"]) + if not rctx.delete("whl_file.json"): + fail("failed to delete the whl_file.json file") if rctx.attr.whl_patches: patches = {} @@ -791,7 +810,7 @@ def _whl_library_impl(rctx): build_file_contents = generate_whl_library_build_bazel( repo_prefix = rctx.attr.repo_prefix, - whl_name = whl_path.basename, + whl_name = whl_label or whl_path.basename, dependencies = metadata["deps"], dependencies_by_platform = metadata["deps_by_platform"], group_name = rctx.attr.group_name, @@ -803,6 +822,10 @@ def _whl_library_impl(rctx): ], entry_points = entry_points, annotation = None if not rctx.attr.annotation else struct(**json.decode(rctx.read(rctx.attr.annotation))), + impl_vis = None if not rctx.attr.experimental_whl_label else "@{}{}//:__pkg__".format( + rctx.attr.repo_prefix, + normalize_name(metadata["name"]), + ), ) rctx.file("BUILD.bazel", build_file_contents) @@ -844,6 +867,20 @@ whl_library_attrs = { ), allow_files = True, ), + "experimental_whl_label": attr.label( + doc = """\ +The label of the whl file to use. This allows one to pass a whl file to be used, but at the same +time it changes the assumed whl_library layout. 
With this parameter set, the pip repository layout +becomes as following: +* pip has aliases to wheel libraries based on the version of the toolchain in use. +* downloaded whls and sdists are in separate `pypi_file` repos. +* per each downloaded `whl` or `sdist` there is a `whl_library` that creates a `py_library` target. +* there is a alias `whl_library` repo that allows selecting which implementation `whl_library` repo to use. + +In the future the `whl_library` alias repo might be merged into the main `pip` repo but for that to +happen, the `dependency` closures need to be generated for the selected target python versions. +""", + ), "group_deps": attr.string_list( doc = "List of dependencies to skip in order to break the cycles within a dependency group.", default = [], diff --git a/python/pip_install/private/generate_whl_library_build_bazel.bzl b/python/pip_install/private/generate_whl_library_build_bazel.bzl index 568b00e4d..1cb430e0d 100644 --- a/python/pip_install/private/generate_whl_library_build_bazel.bzl +++ b/python/pip_install/private/generate_whl_library_build_bazel.bzl @@ -137,7 +137,8 @@ def generate_whl_library_build_bazel( entry_points, annotation = None, group_name = None, - group_deps = []): + group_deps = [], + impl_vis = None): """Generate a BUILD file for an unzipped Wheel Args: @@ -155,6 +156,7 @@ def generate_whl_library_build_bazel( group_deps: List[str]; names of fellow members of the group (if any). These will be excluded from generated deps lists so as to avoid direct cycles. These dependencies will be provided at runtime by the group rules which wrap this library and its fellows together. + impl_vis: str; override the visibility of the implementation labels. 
Returns: A complete BUILD file as a string @@ -267,12 +269,12 @@ config_setting( group_repo = repo_prefix + "_groups" library_impl_label = "@%s//:%s_%s" % (group_repo, normalize_name(group_name), PY_LIBRARY_PUBLIC_LABEL) whl_impl_label = "@%s//:%s_%s" % (group_repo, normalize_name(group_name), WHEEL_FILE_PUBLIC_LABEL) - impl_vis = "@%s//:__pkg__" % (group_repo,) + impl_vis = impl_vis or "@%s//:__pkg__" % (group_repo,) else: library_impl_label = PY_LIBRARY_IMPL_LABEL whl_impl_label = WHEEL_FILE_IMPL_LABEL - impl_vis = "//visibility:private" + impl_vis = impl_vis or "//visibility:private" contents = "\n".join( [ diff --git a/python/private/bzlmod/multiarch_whl_library.bzl b/python/private/bzlmod/multiarch_whl_library.bzl new file mode 100644 index 000000000..c93092cbd --- /dev/null +++ b/python/private/bzlmod/multiarch_whl_library.bzl @@ -0,0 +1,298 @@ +# Copyright 2023 The Bazel Authors. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""This stores the `bzlmod` specific `multiarch_whl_library` that can make more +assumptions about how the repositories are setup and we can also fetch some data +from the PyPI via its simple API. + +There is a single `multiarch_whl_library` repository that only does aliases by +target platform to the available whls and/or the whl built from sdist by bazel. +The main algorithm is as below: +1. Fetch data from all known PyPI indexes about where the wheels live. +2. 
+ Ensure that we have found info about all of the packages by checking that we know the + URL for each artifact that is mentioned by its sha256 in the requirements file. +3. Infer the compatible platforms for the artifacts by parsing the last entry + in the URL. This allows us to create a set of `whl_library` repos that (mostly) + do not depend on the host platform. + +Shortcomings of the design: +- We need to wait for the module extension to pull metadata about existing packages + for multiple distributions and it scales as 'number of different packages' x + 'number of indexes'. We do some optimizations that make the scaling better, + but without `MODULE.bazel.lock` we need to do this every time if we haven't + cached the repositories. + +- whl annotation API may further break user workflows because the targets added + via `additive_build_content` are not added to the `pip_XY_foo` but rather to + `pip_XY_foo__plat` which suggests that we should have some way to tell the + `hub` repo to expose extra alias targets. + + The `additive_build_content`, `copy_files` and `copy_executables` are applied to each + extracted `whl` but not exposed to the user via extra alias definition in the hub + repository. + + The `data`, `data_exclude_glob` and `srcs_exclude_glob` are all forwarded to the + definition of the `py_library` target as expected. + +- The cyclic dependencies cannot be automatically resolved yet and we need to fetch + additional whl metadata unless we have the METADATA parsing in fewer places. + +- For now, the wheels are potentially extracted multiple times, but this could be + improved if we unify selecting based on the target platform and python + version into a single `config_setting`. + +Benefits of the design: +- Really fast to iterate as the whls do not need to be re-downloaded if the + sha256 and the whl URL do not change. + +- We can use the same downloaded wheel in multiple `whl_library` instances that + are for different Python versions.
+ +- The dependency closures are still isolated making this a relatively safe change + from the traditional `whl_library`. + +- This could be extended very easily to consume `poetry.lock` or `pdm.lock` files. + +- We can build `rules_oci` images without needed extra work if the `sdists` are for + pure Python `whls` without any extra effort or needing to specify `download = True`. + +- We can download the Simple API contents in parallel with changes landed for 7.1.0. +""" + +load("//python/pip_install:pip_repository.bzl", "whl_library") +load( + "//python/private:labels.bzl", + "DATA_LABEL", + "DIST_INFO_LABEL", + "PY_LIBRARY_IMPL_LABEL", + "PY_LIBRARY_PUBLIC_LABEL", + "WHEEL_FILE_IMPL_LABEL", + "WHEEL_FILE_PUBLIC_LABEL", +) +load("//python/private:normalize_name.bzl", "normalize_name") +load("//python/private:parse_whl_name.bzl", "parse_whl_name") +load("//python/private:text_util.bzl", "render") +load("//python/private:whl_target_platforms.bzl", "whl_target_platforms") + +def multiarch_whl_library(name, *, requirement_by_os, files, extra_pip_args, **kwargs): + """Generate a number of third party repos for a particular wheel. + + Args: + name(str): the name of the apparent repo that does the select on the target platform. + requirement_by_os(dict[str]): the requirement_by_os line that this repo corresponds to. + files(dict[str, PyPISource]): the list of file labels + extra_pip_args(list[str]): The pip args by platform. + **kwargs: extra arguments passed to the underlying `whl_library` repository rule. 
+ """ + needed_shas = {} + for os, requirement in requirement_by_os.items(): + if os == "host": + continue + + for sha in requirement.split("--hash=sha256:")[1:]: + sha = sha.strip() + if sha not in needed_shas: + needed_shas[sha] = [] + + needed_shas[sha].append(os) + + needed_files = { + files.files[sha]: plats + for sha, plats in needed_shas.items() + } + _, _, want_abi = kwargs.get("repo").rpartition("_") + + # TODO @aignas 2023-12-20: how can we get the ABI that we need for this particular repo? It would be better to not need to resolve it and just add it to the `target_platforms` list for the user to provide. + want_abi = "cp" + want_abi + files = {} + for f, oses in needed_files.items(): + if not f.filename.endswith(".whl"): + files["sdist"] = (f, requirement_by_os["host"]) + continue + + parsed = parse_whl_name(f.filename) + + if "musl" in parsed.platform_tag: + # TODO @aignas 2023-12-21: musl wheels are currently unsupported, how can we allow the user to control this? Maybe by target platforms? 
+ continue + + if parsed.abi_tag in ["none", "abi3", want_abi]: + plat = parsed.platform_tag.split(".")[0] + if plat == "any": + files[plat] = (f, requirement_by_os[oses[0]]) + else: + # this assumes that the target_platform for a whl will have the same os, which is most often correct + target_platform = whl_target_platforms(plat)[0] + files[plat] = (f, requirement_by_os.get(target_platform.os, requirement_by_os["default"])) + + libs = {} + for plat, (f, r) in files.items(): + whl_name = "{}__{}".format(name, plat) + libs[plat] = f.filename + req, hash, _ = r.partition("--hash=sha256:") + req = "{} {}{}".format(req.strip(), hash, f.sha256) + whl_library( + name = whl_name, + experimental_whl_label = f.label, + requirement = req, + extra_pip_args = extra_pip_args, + **kwargs + ) + + whl_minihub( + name = name, + repo = kwargs.get("repo"), + group_name = kwargs.get("group_name"), + libs = libs, + annotation = kwargs.get("annotation"), + ) + +def _whl_minihub_impl(rctx): + abi = "cp" + rctx.attr.repo.rpartition("_")[2] + _, repo, suffix = rctx.attr.name.rpartition(rctx.attr.repo) + prefix = repo + suffix + + build_contents = [] + + actual = None + select = {} + for plat, filename in rctx.attr.libs.items(): + tmpl = "@{}__{}//:{{target}}".format(prefix, plat) + + # TODO @aignas 2023-12-20: check if we have 'download_only = True' passed + # to the `whl_library` and then remove the `sdist` from the select and + # add a no_match error message. + if plat == "sdist": + select["//conditions:default"] = tmpl + continue + + whl = parse_whl_name(filename) + + # prefer 'abi3' over 'py3'? 
+ if "py3" in whl.python_tag or "abi3" in whl.python_tag: + select["//conditions:default"] = tmpl + break + + if abi != whl.abi_tag: + continue + + for p in whl_target_platforms(whl.platform_tag): + platform = "is_{}_{}".format(p.os, p.cpu) + select[":" + platform] = tmpl + + config_setting = """\ +config_setting( + name = "{platform}", + constraint_values = [ + "@platforms//cpu:{cpu}", + "@platforms//os:{os}", + ], + visibility = ["//visibility:private"], +)""".format(platform = platform, cpu = p.cpu, os = p.os) + if config_setting not in build_contents: + build_contents.append(config_setting) + + if len(select) == 1 and "//conditions:default" in select: + actual = repr(select["//conditions:default"]) + + select = {k: v for k, v in sorted(select.items())} + + # The overall architecture: + # * `whl_library_for_a_whl should generate only the private targets + # * `whl_minihub` should do the `group` to `private` indirection as needed. + # + # then the group visibility settings remain the same. + # then we can also set the private target visibility to something else than public + # e.g. 
the _sha265 targets can only be accessed by the minihub + + group_name = rctx.attr.group_name + if group_name: + group_repo = rctx.attr.repo + "__groups" + impl_vis = "@{}//:__pkg__".format(group_repo) + library_impl_label = "@%s//:%s_%s" % (group_repo, normalize_name(group_name), "pkg") + whl_impl_label = "@%s//:%s_%s" % (group_repo, normalize_name(group_name), "whl") + else: + library_impl_label = PY_LIBRARY_IMPL_LABEL + whl_impl_label = WHEEL_FILE_IMPL_LABEL + impl_vis = "//visibility:private" + + public_visibility = "//visibility:public" + + alias_targets = { + DATA_LABEL: public_visibility, + DIST_INFO_LABEL: public_visibility, + PY_LIBRARY_IMPL_LABEL: impl_vis, + WHEEL_FILE_IMPL_LABEL: impl_vis, + } + + if rctx.attr.annotation: + annotation = struct(**json.decode(rctx.read(rctx.attr.annotation))) + + for dest in annotation.copy_files.values(): + alias_targets["{}.copy".format(dest)] = public_visibility + + for dest in annotation.copy_executables.values(): + alias_targets["{}.copy".format(dest)] = public_visibility + + # FIXME @aignas 2023-12-14: is this something that we want, looks a + # little bit hacky as we don't parse the visibility of the extra + # targets. 
+ if annotation.additive_build_content: + targets_defined_in_additional_info = [ + line.partition("=")[2].strip().strip("\"',") + for line in annotation.additive_build_content.split("\n") + if line.strip().startswith("name") + ] + for dest in targets_defined_in_additional_info: + alias_targets[dest] = public_visibility + + build_contents += [ + render.alias( + name = target, + actual = actual.format(target = target) if actual else render.select({k: v.format(target = target) for k, v in select.items()}), + visibility = [visibility], + ) + for target, visibility in alias_targets.items() + ] + + build_contents += [ + render.alias( + name = target, + actual = repr(actual), + visibility = ["//visibility:public"], + ) + for target, actual in { + PY_LIBRARY_PUBLIC_LABEL: library_impl_label, + WHEEL_FILE_PUBLIC_LABEL: whl_impl_label, + }.items() + ] + + rctx.file("BUILD.bazel", "\n\n".join(build_contents)) + +whl_minihub = repository_rule( + attrs = { + "annotation": attr.label( + doc = ( + "Optional json encoded file containing annotation to apply to the extracted wheel. " + + "See `package_annotation`" + ), + allow_files = True, + ), + "group_name": attr.string(), + "libs": attr.string_dict(mandatory = True), + "repo": attr.string(mandatory = True), + }, + doc = """A rule for bzlmod mulitple pip repository creation. 
PRIVATE USE ONLY.""", + implementation = _whl_minihub_impl, +) diff --git a/python/private/bzlmod/pip.bzl b/python/private/bzlmod/pip.bzl index ce3ddde66..c9a1efd82 100644 --- a/python/private/bzlmod/pip.bzl +++ b/python/private/bzlmod/pip.bzl @@ -14,7 +14,6 @@ "pip module extension for use with bzlmod" -load("@bazel_features//:features.bzl", "bazel_features") load("@pythons_hub//:interpreters.bzl", "DEFAULT_PYTHON_VERSION", "INTERPRETER_LABELS") load( "//python/pip_install:pip_repository.bzl", @@ -28,7 +27,9 @@ load("//python/pip_install:requirements_parser.bzl", parse_requirements = "parse load("//python/private:normalize_name.bzl", "normalize_name") load("//python/private:parse_whl_name.bzl", "parse_whl_name") load("//python/private:version_label.bzl", "version_label") +load(":multiarch_whl_library.bzl", "multiarch_whl_library") load(":pip_repository.bzl", "pip_repository") +load(":pypi_metadata.bzl", "whl_files_from_requirements") def _parse_version(version): major, _, version = version.partition(".") @@ -97,11 +98,14 @@ You cannot use both the additive_build_content and additive_build_content_file a whl_mods = whl_mods, ) -def _create_whl_repos(module_ctx, pip_attr, whl_map, whl_overrides): +def _create_whl_repos(module_ctx, pip_attr, whl_map, whl_overrides, files): python_interpreter_target = pip_attr.python_interpreter_target # if we do not have the python_interpreter set in the attributes # we programmatically find it. 
+ # + # TODO @aignas 2023-12-20: figure out how to set a parameter that is + # not platform specific hub_name = pip_attr.hub_name if python_interpreter_target == None and not pip_attr.python_interpreter: python_name = "python_{}_host".format( @@ -122,26 +126,6 @@ def _create_whl_repos(module_ctx, pip_attr, whl_map, whl_overrides): hub_name, version_label(pip_attr.python_version), ) - requrements_lock = locked_requirements_label(module_ctx, pip_attr) - - # Parse the requirements file directly in starlark to get the information - # needed for the whl_libary declarations below. - requirements_lock_content = module_ctx.read(requrements_lock) - parse_result = parse_requirements(requirements_lock_content) - - # Replicate a surprising behavior that WORKSPACE builds allowed: - # Defining a repo with the same name multiple times, but only the last - # definition is respected. - # The requirement lines might have duplicate names because lines for extras - # are returned as just the base package name. e.g., `foo[bar]` results - # in an entry like `("foo", "foo[bar] == 1.0 ...")`. - requirements = { - normalize_name(entry[0]): entry - # The WORKSPACE pip_parse sorted entries, so mimic that ordering. - for entry in sorted(parse_result.requirements) - }.values() - - extra_pip_args = pip_attr.extra_pip_args + parse_result.options if hub_name not in whl_map: whl_map[hub_name] = {} @@ -169,8 +153,50 @@ def _create_whl_repos(module_ctx, pip_attr, whl_map, whl_overrides): groups = pip_attr.experimental_requirement_cycles, ) + requirements_lock = { + # Parse the requirements file directly in starlark to get the information + # needed for the whl_libary declarations below. 
+ os: parse_requirements(module_ctx.read(lock)) + for os, lock in { + "default": pip_attr.requirements_lock, + "host": locked_requirements_label(module_ctx, pip_attr), + "linux": pip_attr.requirements_linux, + "osx": pip_attr.requirements_darwin, + "windows": pip_attr.requirements_windows, + }.items() + if lock + } + + # Replicate a surprising behavior that WORKSPACE builds allowed: + # Defining a repo with the same name multiple times, but only the last + # definition is respected. + # The requirement lines might have duplicate names because lines for extras + # are returned as just the base package name. e.g., `foo[bar]` results + # in an entry like `("foo", "foo[bar] == 1.0 ...")`. + requirements = {} + for os, parse_result in requirements_lock.items(): + for whl_name, reqs in { + normalize_name(entry[0]): entry + # The WORKSPACE pip_parse sorted entries, so mimic that ordering. + for entry in sorted(parse_result.requirements) + }.values(): + if whl_name not in requirements: + requirements[whl_name] = {} + + requirements[whl_name][os] = reqs + + extra_pip_args_per_os = { + os: pip_attr.extra_pip_args + parse_result.options + for os, parse_result in requirements_lock.items() + } + for os, extra_args in extra_pip_args_per_os.items(): + if extra_pip_args_per_os["host"] != extra_args: + fail("the pip arguments in the requirements files should be the same") + + extra_pip_args = extra_pip_args_per_os["host"] + # Create a new wheel library for each of the different whls - for whl_name, requirement_line in requirements: + for whl_name, requirement_by_os in requirements.items(): # We are not using the "sanitized name" because the user # would need to guess what name we modified the whl name # to. 
@@ -179,23 +205,17 @@ def _create_whl_repos(module_ctx, pip_attr, whl_map, whl_overrides): group_name = whl_group_mapping.get(whl_name) group_deps = requirement_cycles.get(group_name, []) - whl_library( + common_args = dict( name = "%s_%s" % (pip_name, whl_name), - requirement = requirement_line, repo = pip_name, repo_prefix = pip_name + "_", annotation = annotation, - whl_patches = { - p: json.encode(args) - for p, args in whl_overrides.get(whl_name, {}).items() - }, experimental_target_platforms = pip_attr.experimental_target_platforms, python_interpreter = pip_attr.python_interpreter, python_interpreter_target = python_interpreter_target, quiet = pip_attr.quiet, timeout = pip_attr.timeout, isolated = use_isolated(module_ctx, pip_attr), - extra_pip_args = extra_pip_args, download_only = pip_attr.download_only, pip_data_exclude = pip_attr.pip_data_exclude, enable_implicit_namespace_pkgs = pip_attr.enable_implicit_namespace_pkgs, @@ -204,6 +224,28 @@ def _create_whl_repos(module_ctx, pip_attr, whl_map, whl_overrides): group_deps = group_deps, ) + if files: + multiarch_whl_library( + files = files[whl_name], + requirement_by_os = requirement_by_os, + extra_pip_args = extra_pip_args, + # patching is done in whl_files_from_requirements + **common_args + ) + else: + whl_library( + requirement = requirement_by_os["host"], + extra_pip_args = extra_pip_args, + whl_patches = { + p: json.encode(args) + for p, args in whl_overrides.get(whl_name, {}).items() + }, + **common_args + ) + + if hub_name not in whl_map: + whl_map[hub_name] = {} + if whl_name not in whl_map[hub_name]: whl_map[hub_name][whl_name] = {} @@ -307,6 +349,13 @@ def _pip_impl(module_ctx): whl_overrides[whl_name][patch].whls.append(attr.file) + # TODO @aignas 2023-12-18: figure out how to make the behaviour switchable + files = whl_files_from_requirements( + module_ctx = module_ctx, + name = "pypi_whl", + whl_overrides = whl_overrides, + ) + # Used to track all the different pip hubs and the spoke pip Python 
# versions. pip_hub_map = {} @@ -351,7 +400,12 @@ def _pip_impl(module_ctx): else: pip_hub_map[pip_attr.hub_name].python_versions.append(pip_attr.python_version) - _create_whl_repos(module_ctx, pip_attr, hub_whl_map, whl_overrides) + _create_whl_repos(module_ctx, pip_attr, hub_whl_map, whl_overrides, files) + + # TODO @aignas 2024-01-11: the ideal design would have the whl minihubs + # passed in the pip_repository and the selection by version is happening + # inside the `whl_minihub` alias. However that requires per-python-version + # whl METADATA parsing. for hub_name, whl_map in hub_whl_map.items(): pip_repository( @@ -469,6 +523,33 @@ cannot have a child module that uses the same `hub_name`. } return attrs +_target_platforms = tag_class( + attrs = { + "enabled": attr.bool( + doc = """\ +Enable this feature in this root module by setting this to True. +""", + mandatory = True, + ), + "extra_index_urls": attr.string_list( + doc = """\ +Extra index URLs to look up for remaining packages. +""", + ), + "index_url": attr.string( + default = "https://pypi.org/simple", + doc = """\ +The PyPI index URL to fetch the majority of the packages from. +""", + ), + }, + doc = """\ +Setup multi-platform support for PyPI hub repositories. +NOTE: this feature is still very experimental and as a result may contain +breaking changes more frequently than described in our breaking change policy. +""", +) + # NOTE: the naming of 'override' is taken from the bzlmod native # 'archive_override', 'git_override' bzlmod functions. _override_tag = tag_class( @@ -498,17 +579,6 @@ Apply any overrides (e.g. patches) to a given Python distribution defined by other tags in this extension.""", ) -def _extension_extra_args(): - args = {} - - if bazel_features.external_deps.module_extension_has_os_arch_dependent: - args = args | { - "arch_dependent": True, - "os_dependent": True, - } - - return args - pip = module_extension( doc = """\ This extension is used to make dependencies from pip available. 
@@ -529,6 +599,7 @@ the BUILD files for wheels. """, implementation = _pip_impl, tag_classes = { + "experimental_target_platforms": _target_platforms, "override": _override_tag, "parse": tag_class( attrs = _pip_parse_ext_attrs(), @@ -551,7 +622,6 @@ extension. """, ), }, - **_extension_extra_args() ) def _whl_mods_repo_impl(rctx): diff --git a/python/private/bzlmod/pypi_archive.bzl b/python/private/bzlmod/pypi_archive.bzl new file mode 100644 index 000000000..3cbf04999 --- /dev/null +++ b/python/private/bzlmod/pypi_archive.bzl @@ -0,0 +1,147 @@ +# Copyright 2023 The Bazel Authors. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +"TODO" + +load("//python:versions.bzl", "WINDOWS_NAME") +load("//python/private:auth.bzl", "get_auth") +load("//python/private:bzlmod_enabled.bzl", "BZLMOD_ENABLED") +load("//python/private:patch_whl.bzl", "patch_whl") +load("//python/private:toolchains_repo.bzl", "get_host_os_arch") + +_HTTP_FILE_DOC = """See documentation for the attribute with the same name +in [http_file docs](https://bazel.build/rules/lib/repo/http#http_file document).""" + +def _impl(rctx): + prefix, _, _ = rctx.attr.name.rpartition("_") + prefix, _, _ = prefix.rpartition("_") + + _, _, filename = rctx.attr.urls[0].rpartition("/") + output = "whl/{}".format(filename.strip()) + + urls = rctx.attr.urls + auth = get_auth(rctx, urls) + + result = rctx.download( + url = urls, + output = output, + sha256 = rctx.attr.sha256, + auth = auth, + canonical_id = rctx.attr.canonical_id, + integrity = rctx.attr.integrity, + ) + if not result.success: + fail(result) + + whl_path = rctx.path(output) + if not whl_path.exists: + fail("BUG: the downloaded path does not exist, but the download was successfull") + + if rctx.attr.patches: + whl_path = patch_whl( + rctx, + python_interpreter = _resolve_python_interpreter(rctx), + whl_path = whl_path, + patched_whl_path = "patched/{}".format(whl_path.basename), + patches = { + p: int(strip) + for p, strip in rctx.attr.patches.items() + }, + quiet = rctx.attr.quiet, + ) + + # NOTE @aignas 2023-12-20: this symlink is to ensure that the + # rctx.path(label) resolves to the right file. 
+ rctx.symlink(whl_path, whl_path.basename) + + rctx.file( + "BUILD.bazel", + """\ +exports_files( + ["{filename}"], + visibility=["//visibility:public"], +) +""".format(filename = whl_path.basename), + ) + +_pypi_file_attrs = { + "auth_patterns": attr.string_dict(doc = _HTTP_FILE_DOC), + "canonical_id": attr.string(doc = _HTTP_FILE_DOC), + "integrity": attr.string(doc = _HTTP_FILE_DOC), + "netrc": attr.string(doc = _HTTP_FILE_DOC), + "patches": attr.label_keyed_string_dict( + doc = """\ +A label-keyed-string dict that has patch_strip as the value and the patch to be applied as +a label. The patches are applied in the same order as they are listed in the dictionary. +""", + ), + "python_interpreter": attr.string(doc = "The python interpreter to use when patching"), + "python_interpreter_target": attr.label(doc = "The python interpreter target to use when patching"), + "quiet": attr.bool(doc = "Silence the stdout/stdeer during patching", default = True), + "sha256": attr.string(doc = _HTTP_FILE_DOC), + "urls": attr.string_list(doc = _HTTP_FILE_DOC), + "_tools": attr.label_list( + default = ["//python/private:repack_whl.py"], + ), +} + +pypi_file = repository_rule( + attrs = _pypi_file_attrs, + doc = """A rule for downloading a single file from a PyPI like index.""", + implementation = _impl, +) + +# TODO @aignas 2023-12-16: expose getting interpreter +def _get_python_interpreter_attr(rctx): + """A helper function for getting the `python_interpreter` attribute or it's default + + Args: + rctx (repository_ctx): Handle to the rule repository context. + + Returns: + str: The attribute value or it's default + """ + if rctx.attr.python_interpreter: + return rctx.attr.python_interpreter + + if "win" in rctx.os.name: + return "python.exe" + else: + return "python3" + +def _resolve_python_interpreter(rctx): + """Helper function to find the python interpreter from the common attributes + + Args: + rctx: Handle to the rule repository context. 
+ Returns: Python interpreter path. + """ + python_interpreter = _get_python_interpreter_attr(rctx) + + if rctx.attr.python_interpreter_target != None: + python_interpreter = rctx.path(rctx.attr.python_interpreter_target) + + if BZLMOD_ENABLED: + (os, _) = get_host_os_arch(rctx) + + # On Windows, the symlink doesn't work because Windows attempts to find + # Python DLLs where the symlink is, not where the symlink points. + if os == WINDOWS_NAME: + python_interpreter = python_interpreter.realpath + elif "/" not in python_interpreter: + found_python_interpreter = rctx.which(python_interpreter) + if not found_python_interpreter: + fail("python interpreter `{}` not found in PATH".format(python_interpreter)) + python_interpreter = found_python_interpreter + return python_interpreter diff --git a/python/private/bzlmod/pypi_metadata.bzl b/python/private/bzlmod/pypi_metadata.bzl new file mode 100644 index 000000000..d035bd609 --- /dev/null +++ b/python/private/bzlmod/pypi_metadata.bzl @@ -0,0 +1,295 @@ +# Copyright 2023 The Bazel Authors. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +"""PyPI metadata hub and spoke repos""" + +load("@bazel_features//:features.bzl", "bazel_features") +load("//python/pip_install:requirements_parser.bzl", parse_requirements = "parse") +load("//python/private:normalize_name.bzl", "normalize_name") +load("//python/private:text_util.bzl", "render") +load(":pypi_archive.bzl", "pypi_file") + +def PyPISource(*, filename, label, sha256): + """Create a PyPISource struct. + + Args: + filename(str): The filename of the source. + label(str or Label): The label to the source. + sha256(str): The sha256 of the source, useful for matching against the `requirements` line. + + Returns: + struct with filename(str), label(Label) and sha256(str) attributes + """ + return struct( + filename = filename, + label = label, + sha256 = sha256, + ) + +def whl_files_from_requirements(module_ctx, *, name, whl_overrides = {}): + """Fetch archives for all requirements files using the bazel downloader. + + Args: + module_ctx: The module_ctx struct from the extension. + name: The prefix of the fetched archive repos. + whl_overrides: patches to be applied after fetching. + + Returns: + a dict with the fetched metadata to be used later when creating hub and spoke repos. 
+ """ + enabled = False + indexes = [] + for module in module_ctx.modules: + for attr in module.tags.experimental_target_platforms: + if not module.is_root: + fail("setting target platforms is only supported in root modules") + + enabled = attr.enabled + for index in [attr.index_url] + attr.extra_index_urls: + if index not in indexes: + indexes.append(index) + break + + if not enabled: + return None + + requirements_files = [ + requirements_lock + for module in module_ctx.modules + for pip_attr in module.tags.parse + for requirements_lock in [ + pip_attr.requirements_lock, + pip_attr.requirements_linux, + pip_attr.requirements_darwin, + pip_attr.requirements_windows, + ] + if requirements_lock + ] + + sha256s_by_distribution = {} + for requirements_lock in requirements_files: + requirements_lock_content = module_ctx.read(requirements_lock) + parse_result = parse_requirements(requirements_lock_content) + for distribution, line in parse_result.requirements: + sha256s = [sha.strip() for sha in line.split("--hash=sha256:")[1:]] + distribution = normalize_name(distribution) + + if distribution not in sha256s_by_distribution: + sha256s_by_distribution[distribution] = {} + + for sha in sha256s: + sha256s_by_distribution[distribution][sha] = True + + metadata = _fetch_metadata( + module_ctx, + sha256s_by_distribution = sha256s_by_distribution, + indexes = indexes, + ) + + ret = {} + repos = {} + + for distribution, m in metadata.items(): + files = {} + + for file in m.files: + _, _, filename = file.url.rpartition("/") + suffix = "{distribution}_{sha}".format(distribution = distribution, sha = file.sha256[:8]) + + # We could use http_file, but we want to also be able to patch the whl + # file, which is something http_file does not know how to do. + # if the url is known (in case of using pdm lock), we could use an + # http_file. 
+ + patches = { + patch_file: str(patch_dst.patch_strip) + for patch_file, patch_dst in whl_overrides.get(distribution, {}).items() + if filename in patch_dst.whls + } + + pypi_file( + name = name + "_" + suffix, + sha256 = file.sha256, + # FIXME @aignas 2023-12-18: consider if we should replace this + # with http_file + whl_library from pycross that philsc is + # working on. + # + # In the long term, it may be easier to maintain, especially + # since this implementation needs to copy functionality around + # credential helpers, etc to be useful. + # + # I tried to use `http_file` and do patching in a separate repository + # rule and it failed since the patching (and whl_library) depends on the + # correct filename of the downloaded file, which can be set via + # `downloaded_file_path`. However, that does not create a + # symlink called `file` next to the target, which means that + # the result becomes unusable in the repository_ctx.path + # function. If the patching and extracting is done with build + # actions, like the `py_whl_library` is doing, then we could in + # theory just use `http_file`. 
+ patches = patches, + urls = [file.url], + # FIXME @aignas 2023-12-15: add usage of the DEFAULT_PYTHON_VERSION + # to get the hermetic interpreter + ) + + files[file.sha256] = PyPISource( + filename = filename, + label = "@{name}//{suffix}:{filename}".format( + name = name, + suffix = suffix, + filename = filename, + ), + sha256 = file.sha256, + ) + repos[suffix] = filename + + ret[normalize_name(distribution)] = struct( + distribution = distribution, + files = files, + ) + + _pypi_metadata_hub( + name = name, + repo = name, + repos = repos, + ) + + return ret + +def _hub_impl(rctx): + aliases = { + suffix: "@{}_{}//:{}".format(rctx.attr.repo, suffix, filename) + for suffix, filename in rctx.attr.repos.items() + } + for suffix, filename in rctx.attr.repos.items(): + rctx.file( + "{}/BUILD.bazel".format(suffix), + render.alias( + name = filename, + actual = repr(aliases[suffix]), + visibility = ["//visibility:public"], + ), + ) + +_pypi_metadata_hub = repository_rule( + _hub_impl, + attrs = { + "repo": attr.string(mandatory = True), + "repos": attr.string_dict(mandatory = True), + }, +) + +def _fetch_metadata(module_ctx, *, sha256s_by_distribution, indexes): + # Create a copy that is mutable within this context and use it like a queue + want = { + d: {sha: True for sha in shas.keys()} + for d, shas in sha256s_by_distribution.items() + } + got = {} + + for i, index_url in enumerate(indexes): + # Fetch from each index one by one so that we could do less work when fetching from the next index. 
+ download_kwargs = {} + if bazel_features.external_deps.download_has_block_param: + download_kwargs["block"] = False + + got_urls = _fetch_urls_from_index( + module_ctx, + index_url = index_url, + need_to_download = want, + fname_prefix = "index-{}".format(i), + **download_kwargs + ) + + for distribution, shas in got_urls.items(): + if distribution not in got: + got[distribution] = {} + + for sha256, url in shas.items(): + got[distribution][sha256] = url + want[distribution].pop(sha256) + + if not want[distribution]: + want.pop(distribution) + + if want: + fail("Could not find files for: {}".format(want)) + + return { + distribution: struct( + files = [ + struct( + url = url, + sha256 = sha256, + ) + for sha256, url in urls.items() + ], + ) + for distribution, urls in got.items() + } + +def _fetch_urls_from_index(module_ctx, *, index_url, need_to_download, fname_prefix, **download_kwargs): + downloads = {} + for distribution in need_to_download: + downloads[distribution] = {} + fname = "{}-{}.html".format(fname_prefix, distribution) + download = module_ctx.download( + # NOTE @aignas 2023-12-29: the trailing slash is important, because + # curl would do a redirect and some private registries may break if + # the authentication is required and the trailing slash is not present. 
+ url = "{}/{}/".format(index_url.rstrip("/"), distribution), + output = fname, + **download_kwargs + ) + + if not download_kwargs.get("block", True): + downloads[distribution] = (download, fname) + elif not download.success: + fail(download) + else: + downloads[distribution] = fname + + if not download_kwargs.get("block", True): + for distribution, (download, fname) in downloads.items(): + result = download.wait() + if not result.success: + fail(result) + + downloads[distribution] = fname + + got_urls = {} + for distribution, fname in downloads.items(): + got_urls[distribution] = {} + contents = module_ctx.read(fname) + got_shas = _parse_simple_api(contents, need_to_download[distribution]) + for sha256, url in got_shas: + got_urls[distribution][sha256] = url + + return got_urls + +def _parse_simple_api(html, want_shas): + got = [] + + _, _, hrefs = html.partition(" list[pathlib.Path]: return got_files + extra_files + got_distinfos + extra_distinfos +def _move_files(*, src: pathlib.Path, dest: pathlib.Path, excludes: list[pathlib.Path]): + """Move the files recursively to ensure that excludes are respected. + + This ensure that excludes being in sub-directories relative to the CWD will still + be honoured. 
+ """ + for p in src.glob("*"): + if p in excludes: + logging.debug(f"Ignoring: {p}") + continue + rel_path = p.relative_to(src) + dst = dest / rel_path + + is_relative_to_excludes = [x for x in excludes if x.is_relative_to(p)] + if is_relative_to_excludes: + _move_files(src=p, dest=dst, excludes=is_relative_to_excludes) + continue + + dst.parent.mkdir(exist_ok=True) + p.rename(dst) + logging.debug(f"mv {p} -> {dst}") + + def main(sys_argv): parser = argparse.ArgumentParser(description=__doc__) parser.add_argument( @@ -134,23 +157,17 @@ def main(sys_argv): patched_wheel_dir = cwd / tmpdir logging.debug(f"Created a tmpdir: {patched_wheel_dir}") - excludes = [args.whl_path, patched_wheel_dir] + exclude_top_level = [args.whl_path, patched_wheel_dir] logging.debug("Moving whl contents to the newly created tmpdir") - for p in cwd.glob("*"): - if p in excludes: - logging.debug(f"Ignoring: {p}") - continue - - rel_path = p.relative_to(cwd) - dst = p.rename(patched_wheel_dir / rel_path) - logging.debug(f"mv {p} -> {dst}") + _move_files(src=cwd, dest=patched_wheel_dir, excludes=exclude_top_level) distinfo_dir = next(iter(patched_wheel_dir.glob("*dist-info"))) logging.debug(f"Found dist-info dir: {distinfo_dir}") record_path = distinfo_dir / "RECORD" record_contents = record_path.read_text() if record_path.exists() else "" + args.output.parent.mkdir(parents=True, exist_ok=True) with _WhlFile(args.output, mode="w", distinfo_dir=distinfo_dir) as out: for p in _files_to_pack(patched_wheel_dir, record_contents): rel_path = p.relative_to(patched_wheel_dir)