Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

WiP: Incremental subsetting of PEX lock #14923

Closed
wants to merge 2 commits into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
15 changes: 7 additions & 8 deletions src/python/pants/backend/python/goals/pytest_runner.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,7 @@
from pants.backend.python.util_rules.interpreter_constraints import InterpreterConstraints
from pants.backend.python.util_rules.local_dists import LocalDistsPex, LocalDistsPexRequest
from pants.backend.python.util_rules.pex import Pex, PexRequest, VenvPex, VenvPexProcess
from pants.backend.python.util_rules.pex_from_targets import RequirementsPexRequest
from pants.backend.python.util_rules.pex_from_targets import PexReqs, PexReqsRequest
from pants.backend.python.util_rules.python_sources import (
PythonSourceFiles,
PythonSourceFilesRequest,
Expand All @@ -33,7 +33,7 @@
)
from pants.core.util_rules.config_files import ConfigFiles, ConfigFilesRequest
from pants.core.util_rules.source_files import SourceFiles, SourceFilesRequest
from pants.engine.addresses import Address
from pants.engine.addresses import Address, Addresses
from pants.engine.collection import Collection
from pants.engine.environment import Environment, EnvironmentRequest
from pants.engine.fs import (
Expand Down Expand Up @@ -168,9 +168,7 @@ async def setup_pytest_for_target(

interpreter_constraints = InterpreterConstraints.create_from_targets(all_targets, python_setup)

requirements_pex_get = Get(
Pex, RequirementsPexRequest([request.field_set.address], internal_only=True)
)
pex_reqs_get = Get(PexReqs, PexReqsRequest(Addresses([request.field_set.address])))
pytest_pex_get = Get(
Pex,
PexRequest(
Expand Down Expand Up @@ -198,14 +196,14 @@ async def setup_pytest_for_target(

(
pytest_pex,
requirements_pex,
pex_reqs,
prepared_sources,
field_set_source_files,
field_set_extra_env,
extra_output_directory_digest,
) = await MultiGet(
pytest_pex_get,
requirements_pex_get,
pex_reqs_get,
prepared_sources_get,
field_set_source_files_get,
field_set_extra_env_get,
Expand All @@ -227,9 +225,10 @@ async def setup_pytest_for_target(
PexRequest(
output_filename="pytest_runner.pex",
interpreter_constraints=interpreter_constraints,
requirements=pex_reqs.requirements,
main=pytest.main,
internal_only=True,
pex_path=[pytest_pex, requirements_pex, local_dists.pex],
pex_path=[pytest_pex, local_dists.pex, *pex_reqs.pexes],
),
)
config_files_get = Get(
Expand Down
31 changes: 15 additions & 16 deletions src/python/pants/backend/python/lint/pylint/rules.py
Original file line number Diff line number Diff line change
Expand Up @@ -25,14 +25,15 @@
VenvPexProcess,
VenvPexRequest,
)
from pants.backend.python.util_rules.pex_from_targets import RequirementsPexRequest
from pants.backend.python.util_rules.pex_from_targets import PexReqs, PexReqsRequest
from pants.backend.python.util_rules.python_sources import (
PythonSourceFiles,
PythonSourceFilesRequest,
)
from pants.core.goals.lint import REPORT_DIR, LintResult, LintResults, LintTargetsRequest
from pants.core.util_rules.config_files import ConfigFiles, ConfigFilesRequest
from pants.core.util_rules.source_files import SourceFiles, SourceFilesRequest
from pants.engine.addresses import Addresses
from pants.engine.collection import Collection
from pants.engine.fs import CreateDigest, Digest, Directory, MergeDigests, RemovePrefix
from pants.engine.process import FallibleProcessResult
Expand Down Expand Up @@ -74,18 +75,6 @@ def generate_argv(source_files: SourceFiles, pylint: Pylint) -> Tuple[str, ...]:
async def pylint_lint_partition(
partition: PylintPartition, pylint: Pylint, first_party_plugins: PylintFirstPartyPlugins
) -> LintResult:
requirements_pex_get = Get(
Pex,
RequirementsPexRequest(
(t.address for t in partition.root_targets),
# NB: These constraints must be identical to the other PEXes. Otherwise, we risk using
# a different version for the requirements than the other two PEXes, which can result
# in a PEX runtime error about missing dependencies.
hardcoded_interpreter_constraints=partition.interpreter_constraints,
internal_only=True,
),
)

pylint_pex_get = Get(
Pex,
PexRequest,
Expand All @@ -101,19 +90,28 @@ async def pylint_lint_partition(
)
# Ensure that the empty report dir exists.
report_directory_digest_get = Get(Digest, CreateDigest([Directory(REPORT_DIR)]))
pex_reqs_get = Get(
PexReqs, PexReqsRequest(
Addresses(t.address for t in partition.root_targets),
# NB: These constraints must be identical to the other PEXes. Otherwise, we risk using
# a different version for the requirements than the other two PEXes, which can result
# in a PEX runtime error about missing dependencies.
interpreter_constraints=partition.interpreter_constraints,
)
)

(
pylint_pex,
requirements_pex,
prepared_python_sources,
field_set_sources,
report_directory,
pex_reqs,
) = await MultiGet(
pylint_pex_get,
requirements_pex_get,
prepare_python_sources_get,
field_set_sources_get,
report_directory_digest_get,
pex_reqs_get,
)

pylint_runner_pex, config_files = await MultiGet(
Expand All @@ -122,10 +120,11 @@ async def pylint_lint_partition(
VenvPexRequest(
PexRequest(
output_filename="pylint_runner.pex",
requirements=pex_reqs.requirements,
interpreter_constraints=partition.interpreter_constraints,
main=pylint.main,
internal_only=True,
pex_path=[pylint_pex, requirements_pex],
pex_path=[pylint_pex, *pex_reqs.pexes],
),
# TODO(John Sirois): Remove this (change to the default of symlinks) when we can
# upgrade to a version of Pylint with https://github.com/PyCQA/pylint/issues/1470
Expand Down
30 changes: 12 additions & 18 deletions src/python/pants/backend/python/typecheck/mypy/rules.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,14 +17,15 @@
from pants.backend.python.util_rules import pex_from_targets
from pants.backend.python.util_rules.interpreter_constraints import InterpreterConstraints
from pants.backend.python.util_rules.pex import Pex, PexRequest, VenvPex, VenvPexProcess
from pants.backend.python.util_rules.pex_from_targets import RequirementsPexRequest
from pants.backend.python.util_rules.pex_from_targets import PexReqs, PexReqsRequest
from pants.backend.python.util_rules.pex_requirements import PexRequirements
from pants.backend.python.util_rules.python_sources import (
PythonSourceFiles,
PythonSourceFilesRequest,
)
from pants.core.goals.check import REPORT_DIR, CheckRequest, CheckResult, CheckResults
from pants.core.util_rules.source_files import SourceFiles, SourceFilesRequest
from pants.engine.addresses import Addresses
from pants.engine.collection import Collection
from pants.engine.fs import CreateDigest, Digest, FileContent, MergeDigests, RemovePrefix
from pants.engine.process import FallibleProcessResult
Expand Down Expand Up @@ -128,15 +129,13 @@ async def mypy_typecheck_partition(
SourceFilesRequest(tgt.get(PythonSourceField) for tgt in partition.root_targets),
)

# See `requirements_venv_pex` for how this will get wrapped in a `VenvPex`.
requirements_pex_get = Get(
Pex,
RequirementsPexRequest(
(tgt.address for tgt in partition.root_targets),
hardcoded_interpreter_constraints=partition.interpreter_constraints,
internal_only=True,
),
pex_reqs_get = Get(
PexReqs, PexReqsRequest(
Addresses(t.address for t in partition.root_targets),
interpreter_constraints=partition.interpreter_constraints,
)
)

extra_type_stubs_pex_get = Get(
Pex,
PexRequest(
Expand All @@ -156,18 +155,12 @@ async def mypy_typecheck_partition(
),
)

(
closure_sources,
roots_sources,
mypy_pex,
extra_type_stubs_pex,
requirements_pex,
) = await MultiGet(
(closure_sources, roots_sources, mypy_pex, extra_type_stubs_pex, pex_reqs,) = await MultiGet(
closure_sources_get,
roots_sources_get,
mypy_pex_get,
extra_type_stubs_pex_get,
requirements_pex_get,
pex_reqs_get,
)

python_files = determine_python_files(roots_sources.snapshot.files)
Expand All @@ -188,8 +181,9 @@ async def mypy_typecheck_partition(
VenvPex,
PexRequest(
output_filename="requirements_venv.pex",
requirements=pex_reqs.requirements,
internal_only=True,
pex_path=[requirements_pex, extra_type_stubs_pex],
pex_path=[extra_type_stubs_pex, *pex_reqs.pexes],
interpreter_constraints=partition.interpreter_constraints,
),
)
Expand Down
1 change: 1 addition & 0 deletions src/python/pants/backend/python/util_rules/pex.py
Original file line number Diff line number Diff line change
Expand Up @@ -444,6 +444,7 @@ async def build_pex(
)
requirement_count = _pex_lockfile_requirement_count(lock_bytes)
argv.extend(["--lock", lock_path])
argv.extend(request.requirements.req_strings)
else:
header_delimiter = "#"
# Note: this is a very naive heuristic. It will overcount because entries often
Expand Down
78 changes: 74 additions & 4 deletions src/python/pants/backend/python/util_rules/pex_from_targets.py
Original file line number Diff line number Diff line change
Expand Up @@ -27,8 +27,10 @@
CompletePlatforms,
OptionalPex,
OptionalPexRequest,
Pex,
PexPlatforms,
PexRequest,
_is_probably_pex_json_lockfile,
)
from pants.backend.python.util_rules.pex import rules as pex_rules
from pants.backend.python.util_rules.pex_requirements import Lockfile, PexRequirements
Expand All @@ -46,6 +48,7 @@
from pants.util.docutil import doc_url
from pants.util.logging import LogLevel
from pants.util.meta import frozen_after_init
from pants.util.ordered_set import FrozenOrderedSet
from pants.util.strutil import bullet_list, path_safe

logger = logging.getLogger(__name__)
Expand Down Expand Up @@ -193,6 +196,10 @@ class ChosenPythonResolve:
name: str
lockfile_path: str

@property
def description_of_origin(self) -> str:
return f"the resolve `{self.name}` (from `[python].resolves`)"
Comment on lines +199 to +201
Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

There's probably other places this can be leveraged



@dataclass(frozen=True)
class ChosenPythonResolveRequest:
Expand Down Expand Up @@ -499,11 +506,10 @@ async def get_repository_pex(
internal_only=request.internal_only,
requirements=Lockfile(
file_path=chosen_resolve.lockfile_path,
file_path_description_of_origin=(
f"the resolve `{chosen_resolve.name}` (from `[python].resolves`)"
),
file_path_description_of_origin=chosen_resolve.description_of_origin,
resolve_name=chosen_resolve.name,
req_strings=request.requirements.req_strings,
# NB: A blank req_strings means install the entire lockfile
req_strings=FrozenOrderedSet([]),
Comment on lines +511 to +512
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This would be good to document on the Lockfile class rather-than/in-addition-to here. We haven't done a great job with docstrings on these internal APIs, but this in particular could be surprising.

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Well, it's only true if the Lockfile is a PEX lockfile AFAICT. 😵

),
interpreter_constraints=interpreter_constraints,
platforms=request.platforms,
Expand Down Expand Up @@ -657,5 +663,69 @@ async def get_requirements_pex(request: RequirementsPexRequest, setup: PythonSet
return pex_request


@dataclass(frozen=True)
class PexReqsRequest:
addresses: Addresses
interpreter_constraints: InterpreterConstraints | None = None



@dataclass(frozen=True)
class PexReqs:
Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Copy link
Member

@stuhood stuhood Mar 28, 2022

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This is all hairy, and I apologize. But I think that the result as it stands might be a bit off. AFAICT, the workflows look like:

  • If we are using the new optimization:
    1. Don't actually build a requirements-only PEX, and instead hold onto the requirements which would have been used
    2. When actually creating a PEX containing user code, embed the thirdparty requirements
  • If we are not using the optimization:
    1. Build a requirements-only PEX, and put it on the pex-path of another PEX containing user code.

The differentiation between "a requirements-only PEX" (which contains only thirdparty requirements) and the PEX for user code was intentional (although how important it is in practice is unclear), because thirdparty requirements and your code's transitive thirdparty deps change much less frequently than anything else.

So I think that you'll actually want to preserve that aspect, by adjusting the optimization to always create a RequirementsPex, but to do so directly from the Lockfile, rather than from a "repository PEX". That should (AFAICT?) avoid the need for the new type at all, because the existing RequirementsPex creation code can be adjusted directly.

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

What's the benefit of building any PEXs at all with the new optimization? AFAICT building PEXs here puts us in the same boat as before w.r.t. maybe wanting run_against_entire_lockfile, because for each unique subset of 3rdparty deps we'll be building a PEX and that's non-ideal.

Unless you mean for goals like package? I'm pretty sure that code has no different behavior with this PR (although we should probably change that to do bullet point 1, subpoint ii. Today it might be PEXing the whole repo)

Sorry, really confused by this comment 😅


Don't actually build a requirements-only PEX, and instead hold onto the requirements which would have been used
...
Build a requirements-only PEX, and put it on the pex-path of another PEX containing user code.

That's what this PR is already doing

Copy link
Member

@stuhood stuhood Mar 28, 2022

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

What's the benefit of building any PEXs at all with the new optimization? AFAICT building PEXs here puts us in the same boat as before w.r.t. maybe wanting run_against_entire_lockfile, because for each unique subset of 3rdparty deps we'll be building a PEX and that's non-ideal.

It's not possible to run (a PEX) without building a PEX currently. So in the end, a PEX is built, and then a venv is built from it to actually execute. The primary benefit of building the PEX is that it can be cached: i.e., if you lint or test twice in a row with no code changes, we'll hit the cache for the construction of the PEX, and the only thing that is invalidated is running the built PEX.

Building a requirements/third-party PEX independently means that as long as thirdparty requirements have not changed, you can hit the cache for the network-accessing/thirdparty portion of your build. Without that behavior, changes to either first party or third party code will go through PEX's resolution logic.

To see the difference (and again: note in my comment that "how important it is in practice is unclear"), you'd want to compare the difference between main and this branch for a case like: "I ran a test once, made a non-import edit to my test code, and then re-ran". On main, that edit won't actually affect any requirements, and so won't need to hit the network / run resolution logic / consume the lockfile at all: on this branch, it will re-run the PEX resolve. Warm runs like this are an important case for local latency, and we need to account for them (even at the expense of CI / cold performance sometimes).

Sorry, really confused by this comment 😅

Yea, sorry: it's definitely confusing.

Don't actually build a requirements-only PEX, and instead hold onto the requirements which would have been used
...
Build a requirements-only PEX, and put it on the pex-path of another PEX containing user code.

That's what this PR is already doing

Understood: I was trying to explain the behavior difference between main and this branch.

From my perspective, by far the most relevant optimization is skipping the creation of the "repository PEX" (which contains the entire lockfile)... it's not clear to me that adjusting the behavior where we currently create a thirdparty-requirements PEX is necessarily a win.

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

On main, that edit won't actually affect any requirements, and so won't need to hit the network / run resolution logic / consume the lockfile at all: on this branch, it will re-run the PEX resolve.

I'm not seeing that behavior.

joshuacannon@CEPHANDRIUS:~/work/techlabs$ ./pants_from_sources test path/to/a/test_file.py
19:28:19.52 [INFO] Completed: Building pytest.pex from 3rdparty/python/lockfiles/pytest.txt                                                                                 
19:28:19.92 [INFO] Completed: Building local_dists.pex                                                                                                                      
19:28:21.69 [INFO] Completed: Building pytest_runner.pex from 3rdparty/python/lockfiles/repository.lock                                                                     
19:28:22.14 [WARN] Failed to generate JUnit XML data for path/to/a/test_file.py.                                                                                 
19:28:22.14 [ERROR] Completed: Run Pytest - path/to/a/test_file.py failed (exit code 1).

Then I touch that file and sprinkle newlines

joshuacannon@CEPHANDRIUS:~/work/techlabs$ ./pants_from_sources test path/to/a/test_file.py
19:31:27.77 [WARN] Failed to generate JUnit XML data for path/to/a/test_file.py.                                                                                 
19:31:27.77 [ERROR] Completed: Run Pytest - path/to/a/test_file.py failed (exit code 1).

I can share the workunit JSON for proof too.


I think maybe the disconnect is we ARE still capturing the third-party reqs separate from first-party user code. pytest_runner.pex is a VenvPEX we're building which won't contain the user code. Same for pylint and mypy.

Let me profile the reqs.pex way to see if my fear about exploding cache or perf is valid. Then we can compare implementations / side-effects.

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Looks like both solutions have similar characteristics due to PEXs hardlinking/symlinking. So I think the difference between the two is now in the noise and we likely might want to switch gears into what we imagine the future state is, and which option best puts us in that direction.

Copy link
Member Author

@thejcannon thejcannon Mar 29, 2022

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Since the requirements.pex way doesn't involve touching client-code, I'm actually leaning that direction. Thanks for the push in the right direction @stuhood

requirements: Lockfile | PexRequirements
pexes: Iterable[Pex]
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Most iterables would not be frozen and hashable, so consider using tuple here instead.



@rule
async def get_lockfile_subset(
request: PexReqsRequest,
python_setup: PythonSetup,
) -> PexReqs:
if python_setup.enable_resolves:
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Given the size of this if-block, consider inverting the condition and having the other branch be the early return.

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

See #14923 (comment). I think I'll make this a rule.

chosen_resolve = await Get(
ChosenPythonResolve, ChosenPythonResolveRequest(request.addresses)
)
lock_path = chosen_resolve.lockfile_path
requirements_file_digest = await Get(
Digest,
PathGlobs(
[lock_path],
glob_match_error_behavior=GlobMatchErrorBehavior.error,
description_of_origin=chosen_resolve.description_of_origin,
),
)
_digest_contents = await Get(DigestContents, Digest, requirements_file_digest)
lock_bytes = _digest_contents[0].content
Comment on lines +685 to +698
Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This is currently duplicated in the giant rule in pex.py currently 😞 Should we make a new rule?


if _is_probably_pex_json_lockfile(lock_bytes):
if python_setup.run_against_entire_lockfile:
# NB: PEX treats no requirements as "install entire lockfile"
req_strings: FrozenOrderedSet[str] = FrozenOrderedSet()
# @TODO: complain deprecated
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I expect that we should wait at least one release to see whether users who are already using this setting are actually able to stop before we suggest that anyone does.

else:
requirements = await Get(
PexRequirements, _PexRequirementsRequest(request.addresses)
)
req_strings = requirements.req_strings

lockfile = Lockfile(
file_path=chosen_resolve.lockfile_path,
file_path_description_of_origin=chosen_resolve.description_of_origin,
resolve_name=chosen_resolve.name,
req_strings=req_strings,
)
return PexReqs(lockfile, ())

pex = await Get(
Pex,
RequirementsPexRequest(
(request.addresses),
hardcoded_interpreter_constraints=request.interpreter_constraints,
internal_only=True,
),
)
return PexReqs(PexRequirements(), (pex,))


def rules():
return (*collect_rules(), *pex_rules(), *local_dists_rules(), *python_sources_rules())