diff --git a/examples/build_file_generation/BUILD b/examples/build_file_generation/BUILD index ec31255e90..3e6d44a9e9 100644 --- a/examples/build_file_generation/BUILD +++ b/examples/build_file_generation/BUILD @@ -1,8 +1,17 @@ load("@bazel_gazelle//:def.bzl", "gazelle") +load("@pip//:requirements.bzl", "all_whl_requirements") load("@rules_python//gazelle:def.bzl", "GAZELLE_PYTHON_RUNTIME_DEPS") load("@rules_python//gazelle/manifest:defs.bzl", "gazelle_python_manifest") +load("@rules_python//gazelle/modules_mapping:def.bzl", "modules_mapping") load("@rules_python//python:defs.bzl", "py_library") +# This rule fetches the metadata for python packages we depend on. That data is +# required for the gazelle_python_manifest rule to update our manifest file. +modules_mapping( + name = "modules_map", + wheels = all_whl_requirements, +) + # Gazelle python extension needs a manifest file mapping from # an import to the installed package that provides it. # This macro produces two targets: @@ -12,7 +21,7 @@ load("@rules_python//python:defs.bzl", "py_library") # the manifest doesn't need to be updated gazelle_python_manifest( name = "gazelle_python_manifest", - modules_mapping = "@modules_map//:modules_mapping.json", + modules_mapping = ":modules_map", pip_deps_repository_name = "pip", requirements = "//:requirements_lock.txt", ) diff --git a/examples/build_file_generation/WORKSPACE b/examples/build_file_generation/WORKSPACE index 42559329b5..c58b50f21b 100644 --- a/examples/build_file_generation/WORKSPACE +++ b/examples/build_file_generation/WORKSPACE @@ -60,14 +60,3 @@ pip_install( load("@rules_python//gazelle:deps.bzl", _py_gazelle_deps = "gazelle_deps") _py_gazelle_deps() - -load("@rules_python//gazelle/modules_mapping:def.bzl", "modules_mapping") - -# This repository rule fetches the metadata for python packages we -# depend on. That data is required for the gazelle_python_manifest -# rule to update our manifest file. -# To see what this rule does, try `bazel run @modules_map//:print` -modules_mapping( - name = "modules_map", - requirements = "//:requirements_lock.txt", -) diff --git a/gazelle/README.md b/gazelle/README.md index 9edf773a28..20584583c1 100644 --- a/gazelle/README.md +++ b/gazelle/README.md @@ -9,11 +9,8 @@ that generates BUILD file content for Python code. First, you'll need to add Gazelle to your `WORKSPACE` file. Follow the instructions at https://github.com/bazelbuild/bazel-gazelle#running-gazelle-with-bazel -Next, we need to add two more things to the `WORKSPACE`: - -1. fetch the third-party Go libraries that the python extension depends on -1. fetch metadata about your Python dependencies, so that gazelle can - determine which package a given import statement comes from. +Next, we need to fetch the third-party Go libraries that the python extension +depends on. Add this to your `WORKSPACE`: @@ -23,22 +20,12 @@ Add this to your `WORKSPACE`: load("@rules_python//gazelle:deps.bzl", _py_gazelle_deps = "gazelle_deps") _py_gazelle_deps() - -load("@rules_python//gazelle/modules_mapping:def.bzl", "modules_mapping") - -# This repository rule fetches the metadata for python packages we -# depend on. That data is required for the gazelle_python_manifest -# rule to update our manifest file. -# To see what this rule does, try `bazel run @modules_map//:print` -modules_mapping( - name = "modules_map", - # This should point to wherever we declare our python dependencies - requirements = "//:requirements_lock.txt", -) ``` -Next, we'll make a pair of targets for consuming that `modules_mapping` we -fetched, and writing it as a manifest file for Gazelle to read. +Next, we'll fetch metadata about your Python dependencies, so that gazelle can +determine which package a given import statement comes from. This is provided +by the `modules_mapping` rule. We'll make a target for consuming this +`modules_mapping`, and writing it as a manifest file for Gazelle to read. This is checked into the repo for speed, as it takes some time to calculate in a large monorepo. @@ -48,7 +35,16 @@ file. (You can just use `touch` at this point, it just needs to exist.) Then put this in your `BUILD.bazel` file next to the `requirements.txt`: ```starlark +load("@pip//:requirements.bzl", "all_whl_requirements") load("@rules_python//gazelle/manifest:defs.bzl", "gazelle_python_manifest") +load("@rules_python//gazelle/modules_mapping:def.bzl", "modules_mapping") + +# This rule fetches the metadata for python packages we depend on. That data is +# required for the gazelle_python_manifest rule to update our manifest file. +modules_mapping( + name = "modules_map", + wheels = all_whl_requirements, +) # Gazelle python extension needs a manifest file mapping from # an import to the installed package that provides it. @@ -59,9 +55,7 @@ load("@rules_python//gazelle/manifest:defs.bzl", "gazelle_python_manifest") # the manifest doesn't need to be updated gazelle_python_manifest( name = "gazelle_python_manifest", - # The @modules_map refers to the name we gave in the modules_mapping - # rule in the WORKSPACE - modules_mapping = "@modules_map//:modules_mapping.json", + modules_mapping = ":modules_map", # This is what we called our `pip_install` rule, where third-party # python libraries are loaded in BUILD files. pip_deps_repository_name = "pip", diff --git a/gazelle/modules_mapping/BUILD.bazel b/gazelle/modules_mapping/BUILD.bazel index 4ce6a0001e..d1cd42e7d9 100644 --- a/gazelle/modules_mapping/BUILD.bazel +++ b/gazelle/modules_mapping/BUILD.bazel @@ -1,4 +1,7 @@ -exports_files([ - "builder.py", - "generator.py", -]) +load("@rules_python//python:defs.bzl", "py_binary") + +py_binary( + name = "generator", + srcs = ["generator.py"], + visibility = ["//visibility:public"], +) diff --git a/gazelle/modules_mapping/builder.py b/gazelle/modules_mapping/builder.py deleted file mode 100644 index 3b471c0e09..0000000000 --- a/gazelle/modules_mapping/builder.py +++ /dev/null @@ -1,70 +0,0 @@ -import argparse -import multiprocessing -import subprocess -import sys -from datetime import datetime - -mutex = multiprocessing.Lock() - - -def build(wheel): - print("{}: building {}".format(datetime.now(), wheel), file=sys.stderr) - process = subprocess.run( - [sys.executable, "-m", "build", "--wheel", "--no-isolation"], cwd=wheel - ) - if process.returncode != 0: - # If the build without isolation fails, try to build it again with - # isolation. We need to protect this following logic in two ways: - # 1. Only build one at a time in this process. - # 2. Retry a few times to get around flakiness. - success = False - for _ in range(0, 3): - with mutex: - process = subprocess.run( - [sys.executable, "-m", "build", "--wheel"], - encoding="utf-8", - cwd=wheel, - capture_output=True, - ) - if process.returncode != 0: - continue - success = True - break - if not success: - print("STDOUT:", file=sys.stderr) - print(process.stdout, file=sys.stderr) - print("STDERR:", file=sys.stderr) - print(process.stderr, file=sys.stderr) - raise RuntimeError( - "{}: ERROR: failed to build {}".format(datetime.now(), wheel) - ) - - -def main(jobs, wheels): - with multiprocessing.Pool(jobs) as pool: - results = [] - for wheel in wheels: - result = pool.apply_async(build, args=(wheel,)) - results.append(result) - pool.close() - for result in results: - result.get() - - -if __name__ == "__main__": - parser = argparse.ArgumentParser(description="Builds Python wheels.") - parser.add_argument( - "wheels", - metavar="wheel", - type=str, - nargs="+", - help="A path to the extracted wheel directory.", - ) - parser.add_argument( - "--jobs", - type=int, - default=8, - help="The number of concurrent build jobs to be executed.", - ) - args = parser.parse_args() - exit(main(args.jobs, args.wheels)) diff --git a/gazelle/modules_mapping/def.bzl b/gazelle/modules_mapping/def.bzl index e01ebd3506..e90d4546af 100644 --- a/gazelle/modules_mapping/def.bzl +++ b/gazelle/modules_mapping/def.bzl @@ -7,324 +7,39 @@ distribution should be used in the `deps` attribute of `py_*` targets. This mapping is necessary when reading Python import statements and determining if they are provided by third-party dependencies. Most importantly, when the module name doesn't match the wheel distribution name. - -Currently, this module only works with requirements.txt files locked using -pip-tools (https://github.com/jazzband/pip-tools) with hashes. This is necessary -in order to keep downloaded wheels in the Bazel cache. Also, the -modules_mapping rule does not consider extras as specified by PEP 508. """ -# _modules_mapping_impl is the root entry for the modules_mapping rule -# implementation. -def _modules_mapping_impl(rctx): - requirements_data = rctx.read(rctx.attr.requirements) - python_interpreter = _get_python_interpreter(rctx) - pythonpath = "{}/__pythonpath".format(rctx.path("")) - res = rctx.execute( - [ - python_interpreter, - "-m", - "pip", - "--verbose", - "--isolated", - "install", - "--target={}".format(pythonpath), - "--upgrade", - "--no-build-isolation", - "--no-cache-dir", - "--disable-pip-version-check", - "--index-url={}".format(rctx.attr.pip_index_url), - "build=={}".format(rctx.attr.build_wheel_version), - "setuptools=={}".format(rctx.attr.setuptools_wheel_version), - ], - quiet = rctx.attr.quiet, - timeout = rctx.attr.install_build_timeout, - ) - if res.return_code != 0: - fail(res.stderr) - parsed_requirements = _parse_requirements_txt(requirements_data) - wheels = _get_wheels(rctx, python_interpreter, pythonpath, parsed_requirements) - res = rctx.execute( - [ - python_interpreter, - rctx.path(rctx.attr._generator), - ] + wheels, - quiet = rctx.attr.quiet, - timeout = rctx.attr.generate_timeout, +def _modules_mapping_impl(ctx): + modules_mapping = ctx.actions.declare_file(ctx.attr.modules_mapping_name) + args = ctx.actions.args() + args.add(modules_mapping.path) + args.add_all([whl.path for whl in ctx.files.wheels]) + ctx.actions.run( + inputs = ctx.files.wheels, + outputs = [modules_mapping], + executable = ctx.executable._generator, + arguments = [args], + use_default_shell_env = False, ) - if res.return_code != 0: - fail(res.stderr) - rctx.file("modules_mapping.json", content = res.stdout) - rctx.file("print.sh", content = "#!/usr/bin/env bash\ncat $1", executable = True) - rctx.file("BUILD", """\ -exports_files(["modules_mapping.json"]) - -sh_binary( - name = "print", - srcs = ["print.sh"], - data = [":modules_mapping.json"], - args = ["$(rootpath :modules_mapping.json)"], -) -""") - -# _get_python_interpreter determines whether the system or the user-provided -# Python interpreter should be used and returns the path to be called. -def _get_python_interpreter(rctx): - if rctx.attr.python_interpreter == None: - return "python" - return rctx.path(rctx.attr.python_interpreter) - -# _parse_requirements_txt parses the requirements.txt data into structs with the -# information needed to download them using Bazel. -def _parse_requirements_txt(data): - result = [] - lines = data.split("\n") - current_requirement = "" - continue_previous_line = False - for line in lines: - # Ignore empty lines and comments. - if len(line) == 0 or line.startswith("#"): - continue - - line = line.strip() - - stripped_backslash = False - if line.endswith("\\"): - line = line[:-1] - stripped_backslash = True - - # If this line is a continuation of the previous one, append the current - # line to the current requirement being processed, otherwise, start a - # new requirement. - if continue_previous_line: - current_requirement += line - else: - current_requirement = line - - # Control whether the next line in the requirements.txt should be a - # continuation of the current requirement being processed or not. - continue_previous_line = stripped_backslash - if not continue_previous_line: - result.append(_parse_requirement(current_requirement)) - return result - -# _parse_requirement parses a single requirement line. -def _parse_requirement(requirement_line): - split = requirement_line.split("==") - requirement = {} - - # Removing the extras (https://www.python.org/dev/peps/pep-0508/#extras) - # from the requirement name is fine since it's expected that the - # requirements.txt was compiled with pip-tools, which includes the extras as - # direct dependencies. - name = _remove_extras_from_name(split[0]) - requirement["name"] = name - if len(split) == 1: - return struct(**requirement) - split = split[1].split(" ") - requirement["version"] = split[0] - if len(split) == 1: - return struct(**requirement) - args = split[1:] - hashes = [] - for arg in args: - arg = arg.strip() - - # Skip empty arguments. - if len(arg) == 0: - continue - - # Halt processing if it hits a comment. - if arg.startswith("#"): - break - if arg.startswith("--hash="): - hashes.append(arg[len("--hash="):]) - requirement["hashes"] = hashes - return struct(**requirement) - -# _remove_extras_from_name removes the [extras] from a requirement. -# https://www.python.org/dev/peps/pep-0508/#extras -def _remove_extras_from_name(name): - bracket_index = name.find("[") - if bracket_index == -1: - return name - return name[:bracket_index] - -# _get_wheels returns the wheel distributions for the given requirements. It -# uses a few different strategies depending on whether compiled wheel -# distributions exist on the remote index or not. The order in which it -# operates: -# -# 1. Try to use the platform-independent compiled wheel (*-none-any.whl). -# 2. Try to use the first match of the linux-dependent compiled wheel from the -# sorted releases list. This is valid as it's deterministic and the Python -# extension for Gazelle doesn't support other platform-specific wheels -# (one must use manual means to accomplish platform-specific dependency -# resolution). -# 3. Use the published source for the wheel. -def _get_wheels(rctx, python_interpreter, pythonpath, requirements): - wheels = [] - to_build = [] - for requirement in requirements: - if not hasattr(requirement, "hashes"): - if hasattr(requirement, "name") and requirement.name.startswith("#"): - # This is a comment in the requirements file. - continue - else: - fail("missing requirement hash for {}-{}: use pip-tools to produce a locked file".format( - requirement.name, - requirement.version, - )) - - wheel = {} - wheel["name"] = requirement.name - - requirement_info_url = "{index_base}/{name}/{version}/json".format( - index_base = rctx.attr.index_base, - name = requirement.name, - version = requirement.version, - ) - requirement_info_path = "{}_info.json".format(requirement.name) + return [DefaultInfo(files = depset([modules_mapping]))] - # TODO(f0rmiga): if the logs are too spammy, use rctx.execute with - # Python to perform the downloads since it's impossible to get the - # checksums of these JSON files and there's no option to mute Bazel - # here. - rctx.download(requirement_info_url, output = requirement_info_path) - requirement_info = json.decode(rctx.read(requirement_info_path)) - if requirement.version in requirement_info["releases"]: - wheel["version"] = requirement.version - elif requirement.version.endswith(".0") and requirement.version[:-len(".0")] in requirement_info["releases"]: - wheel["version"] = requirement.version[:-len(".0")] - else: - fail("missing requirement version \"{}\" for wheel \"{}\" in fetched releases: available {}".format( - requirement.version, - requirement.name, - [version for version in requirement_info["releases"]], - )) - releases = sorted(requirement_info["releases"][wheel["version"]], key = _sort_release_by_url) - (wheel_url, sha256) = _search_url(releases, "-none-any.whl") - - # TODO(f0rmiga): handle PEP 600. - # https://www.python.org/dev/peps/pep-0600/ - if not wheel_url: - # Search for the Linux tag as defined in PEP 599. - (wheel_url, sha256) = _search_url(releases, "manylinux2014_x86_64") - if not wheel_url: - # Search for the Linux tag as defined in PEP 571. - (wheel_url, sha256) = _search_url(releases, "manylinux2010_x86_64") - if not wheel_url: - # Search for the Linux tag as defined in PEP 513. - (wheel_url, sha256) = _search_url(releases, "manylinux1_x86_64") - if not wheel_url: - # Search for the MacOS tag - (wheel_url, sha256) = _search_url(releases, "macosx_10_9_x86_64") - - if wheel_url: - wheel_path = wheel_url.split("/")[-1] - rctx.download(wheel_url, output = wheel_path, sha256 = sha256) - wheel["path"] = wheel_path - else: - extension = ".tar.gz" - (src_url, sha256) = _search_url(releases, extension) - if not src_url: - extension = ".zip" - (src_url, sha256) = _search_url(releases, extension) - if not src_url: - fail("requirement URL for {}-{} not found".format(requirement.name, wheel["version"])) - rctx.download_and_extract(src_url, sha256 = sha256) - sanitized_name = requirement.name.lower().replace("-", "_") - requirement_path = src_url.split("/")[-1] - requirement_path = requirement_path[:-len(extension)] - - # The resulting filename for the .whl file is not feasible to - # predict as it has too many variations, so we defer it to the - # Python globing to find the right file name since only one .whl - # file should be generated by the compilation. - wheel_path = "{}/**/*.whl".format(requirement_path) - wheel["path"] = wheel_path - to_build.append(requirement_path) - - wheels.append(json.encode(wheel)) - - if len(to_build) > 0: - res = rctx.execute( - [python_interpreter, rctx.path(rctx.attr._builder)] + to_build, - quiet = rctx.attr.quiet, - environment = { - # To avoid use local "pip.conf" - "HOME": str(rctx.path("").realpath), - # Make uses of pip to use the requested index - "PIP_INDEX_URL": rctx.attr.pip_index_url, - "PYTHONPATH": pythonpath, - }, - ) - if res.return_code != 0: - fail(res.stderr) - - return wheels - -# _sort_release_by_url is the custom function for the key property of the sorted -# releases. -def _sort_release_by_url(release): - return release["url"] - -# _search_url searches for a release in the list of releases that has a url -# matching the provided extension. -def _search_url(releases, extension): - for release in releases: - url = release["url"] - if url.find(extension) >= 0: - return (url, release["digests"]["sha256"]) - return (None, None) - -modules_mapping = repository_rule( +modules_mapping = rule( _modules_mapping_impl, attrs = { - "build_wheel_version": attr.string( - default = "0.5.1", - doc = "The build wheel version.", - ), - "generate_timeout": attr.int( - default = 30, - doc = "The timeout for the generator.py command.", - ), - "index_base": attr.string( - default = "https://pypi.org/pypi", - doc = "The base URL used for querying releases data as JSON.", - ), - "install_build_timeout": attr.int( - default = 30, - doc = "The timeout for the `pip install build` command.", - ), - "pip_index_url": attr.string( - default = "https://pypi.python.org/simple", - doc = "The index URL used for any pip install actions", - ), - "python_interpreter": attr.label( - allow_single_file = True, - doc = "If set, uses the custom-built Python interpreter, otherwise, uses the system one.", - ), - "quiet": attr.bool( - default = True, - doc = "Toggle this attribute to get verbose output from this rule.", - ), - "requirements": attr.label( - allow_single_file = True, - doc = "The requirements.txt file with hashes locked using pip-tools.", + "modules_mapping_name": attr.string( + default = "modules_mapping.json", + doc = "The name for the output JSON file.", + mandatory = False, + ), + "wheels": attr.label_list( + allow_files = True, + doc = "The list of wheels, usually the 'all_whl_requirements' from @//:requirements.bzl", mandatory = True, ), - "setuptools_wheel_version": attr.string( - default = "v57.5.0", - doc = "The setuptools wheel version.", - ), - "_builder": attr.label( - allow_single_file = True, - default = "//gazelle/modules_mapping:builder.py", - ), "_generator": attr.label( - allow_single_file = True, - default = "//gazelle/modules_mapping:generator.py", + cfg = "host", + default = "//gazelle/modules_mapping:generator", + executable = True, ), }, doc = "Creates a modules_mapping.json file for mapping module names to wheel distribution names.", diff --git a/gazelle/modules_mapping/generator.py b/gazelle/modules_mapping/generator.py index 6ee654cfff..b93f9689ec 100644 --- a/gazelle/modules_mapping/generator.py +++ b/gazelle/modules_mapping/generator.py @@ -1,4 +1,3 @@ -import glob import json import pathlib import sys @@ -7,29 +6,19 @@ # Generator is the modules_mapping.json file generator. class Generator: - stdout = None stderr = None + output_file = None - def __init__(self, stdout, stderr): - self.stdout = stdout + def __init__(self, stderr, output_file): self.stderr = stderr + self.output_file = output_file # dig_wheel analyses the wheel .whl file determining the modules it provides # by looking at the directory structure. - def dig_wheel(self, wheel): + def dig_wheel(self, whl): mapping = {} - wheel_paths = glob.glob(wheel["path"]) - assert len(wheel_paths) != 0, "wheel not found for {}: searched for {}".format( - wheel["name"], - wheel["path"], - ) - wheel_path = wheel_paths[0] - assert ( - "UNKNOWN" not in wheel_path - ), "unknown-named wheel found for {}: possibly bad compilation".format( - wheel["name"], - ) - with zipfile.ZipFile(wheel_path, "r") as zip_file: + wheel_name = get_wheel_name(whl) + with zipfile.ZipFile(whl, "r") as zip_file: for path in zip_file.namelist(): if is_metadata(path): continue @@ -40,32 +29,43 @@ def dig_wheel(self, wheel): # where this file is as an importable package. if path.endswith("/__init__.py"): module = path[: -len("/__init__.py")].replace("/", ".") - mapping[module] = wheel["name"] + mapping[module] = wheel_name # Always index the module file. if ext == ".so": # Also remove extra metadata that is embeded as part of # the file name as an extra extension. ext = "".join(pathlib.Path(path).suffixes) module = path[: -len(ext)].replace("/", ".") - mapping[module] = wheel["name"] + mapping[module] = wheel_name return mapping # run is the entrypoint for the generator. def run(self, wheels): mapping = {} - for wheel_json in wheels: - wheel = json.loads(wheel_json) + for whl in wheels: try: - mapping.update(self.dig_wheel(wheel)) + mapping.update(self.dig_wheel(whl)) except AssertionError as error: print(error, file=self.stderr) return 1 mapping_json = json.dumps(mapping) - print(mapping_json, file=self.stdout) - self.stdout.flush() + with open(self.output_file, "w") as f: + f.write(mapping_json) return 0 +def get_wheel_name(path): + pp = pathlib.PurePath(path) + if pp.suffix != ".whl": + raise RuntimeError( + "{} is not a valid wheel file name: the wheel doesn't follow ".format( + pp.name + ) + + "https://www.python.org/dev/peps/pep-0427/#file-name-convention" + ) + return pp.name[: pp.name.find("-")] + + # is_metadata checks if the path is in a metadata directory. # Ref: https://www.python.org/dev/peps/pep-0427/#file-contents. def is_metadata(path): @@ -74,6 +74,7 @@ def is_metadata(path): if __name__ == "__main__": - wheels = sys.argv[1:] - generator = Generator(sys.stdout, sys.stderr) + output_file = sys.argv[1] + wheels = sys.argv[2:] + generator = Generator(sys.stderr, output_file) exit(generator.run(wheels))