toil(package managers:pip): refactor...
- `_download_dependencies()`
- `_process_package_distributions()`
- `DistributionPackageInfo`

Also adjust unit tests
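
The refactor makes `_process_package_distributions()` return one flat list of
`DistributionPackageInfo` objects (best sdist + wheels) and routes every pypi
artifact in `_download_dependencies()` through shared helpers. A condensed
sketch of the new pypi path (illustrative only; names as in this diff):

    artifacts = _process_package_distributions(req, pip_deps_dir, allow_binary)
    to_download, to_check = _classify_download_or_local(artifacts)
    files = {dpi.url: dpi.path for dpi in to_download}
    asyncio.run(async_download_files(files, get_config().concurrency_limit))
    for dpi in to_download + to_check:
        downloaded.append(_process_download_info(req, dpi.download_info, dpi=dpi))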

Signed-off-by: Ben Alkov <ben.alkov@redhat.com>
ben-alkov committed Mar 1, 2024
1 parent 1b89e05 commit 82cc94c
Showing 2 changed files with 180 additions and 139 deletions.
200 changes: 112 additions & 88 deletions cachi2/core/package_managers/pip.py
@@ -1386,7 +1386,9 @@ def _split_hashes_from_options(cls, options: list[str]) -> tuple[list[str], list


def _download_dependencies(
output_dir: RootedPath, requirements_file: PipRequirementsFile, allow_binary: bool = False
output_dir: RootedPath,
requirements_file: PipRequirementsFile,
allow_binary: bool = False,
) -> list[dict[str, Any]]:
"""
Download artifacts of all dependency packages in a requirements.txt file.
@@ -1400,6 +1402,7 @@ def _download_dependencies(
"""
options = _process_options(requirements_file.options)
trusted_hosts = set(options["trusted_hosts"])
downloaded: list[dict[str, Any]] = []

if options["require_hashes"]:
log.info("Global --require-hashes option used, will require hashes")
@@ -1419,72 +1422,98 @@ def _download_dependencies(
pip_deps_dir = output_dir.join_within_root("deps", "pip")
pip_deps_dir.path.mkdir(parents=True, exist_ok=True)

downloaded: list[dict[str, Any]] = []
to_download: list[DistributionPackageInfo] = []
def _classify_download_or_local(
artifacts: list[DistributionPackageInfo],
) -> tuple[list[DistributionPackageInfo], list[DistributionPackageInfo]]:
# check if artifacts already exist locally
to_download = []
to_check = []

for artifact in artifacts:
if artifact.path.exists():
to_check.append(artifact)
log.info("Artifact '%s' found locally", artifact.path.name)
else:
to_download.append(artifact)
return to_download, to_check

for req in requirements_file.requirements:
log.info("Downloading %s", req.download_line)
def _hash_verify(path: Path, checksum_info: Iterable[ChecksumInfo]) -> None:
try:
must_match_any_checksum(path, checksum_info)
except PackageRejected:
path.unlink()
log.warning("Download '%s' was removed from the output directory", path.name)

def _process_download_info(
req: PipRequirement,
download_info: dict[str, Any],
dpi: Optional[DistributionPackageInfo] = None,
) -> dict[str, Any]:
download_info["kind"] = req.kind
download_info["requirement_file"] = str(requirements_file.file_path.subpath_from_root)
download_info["hash_verified"] = False

if req.kind == "pypi":
source, wheels = _process_package_distributions(req, pip_deps_dir, allow_binary)
if allow_binary:
# check if artifact already exists locally
to_download.extend(w for w in wheels if not w.path.exists())

if source is None:
# at least one wheel exists -> report in the SBOM
downloaded.append(
{
"package": req.package,
"version": req.version_specs[0][1],
"kind": req.kind,
"hash_verified": require_hashes,
"requirement_file": str(requirements_file.file_path.subpath_from_root),
}
)
continue
if dpi:
if dpi.should_verify_checksums():
_hash_verify(dpi.path, dpi.checksums_to_verify)
download_info["hash_verified"] = True

download_binary_file(source.url, source.path, auth=None)
_check_metadata_in_sdist(source.path)
download_info = source.download_info
if dpi.package_type == "sdist":
_check_metadata_in_sdist(dpi.path)

elif req.kind == "vcs":
download_info = _download_vcs_package(req, pip_deps_dir)
elif req.kind == "url":
download_info = _download_url_package(req, pip_deps_dir, trusted_hosts)
else:
# Should not happen
raise RuntimeError(f"Unexpected requirement kind: {req.kind!r}")
if require_hashes or req.kind == "url":
hashes = req.hashes or [req.qualifiers.get("cachito_hash", "")]
_hash_verify(download_info["path"], list(map(_to_checksum_info, hashes)))
download_info["hash_verified"] = True

log.info(
"Successfully downloaded %s to %s",
"-- Successfully downloaded/checked '%s' in path '%s'",
req.download_line,
download_info["path"].relative_to(output_dir),
)

if require_hashes or req.kind == "url":
hashes = req.hashes or [req.qualifiers["cachito_hash"]]
must_match_any_checksum(download_info["path"], list(map(_to_checksum_info, hashes)))
download_info["hash_verified"] = True
else:
download_info["hash_verified"] = False
return download_info

download_info["kind"] = req.kind
download_info["requirement_file"] = str(requirements_file.file_path.subpath_from_root)
downloaded.append(download_info)
for req in requirements_file.requirements:
log.info("* Processing requirement line '%s'", req.download_line)

if allow_binary:
log.info("Downloading %d wheel(s) ...", len(to_download))
files: dict[str, Union[str, PathLike[str]]] = {pkg.url: pkg.path for pkg in to_download}
asyncio.run(async_download_files(files, get_config().concurrency_limit))
if req.kind == "pypi":
artifacts = _process_package_distributions(req, pip_deps_dir, allow_binary)
artifacts_to_download, artifacts_to_check = _classify_download_or_local(artifacts)

files: dict[str, Union[str, PathLike[str]]] = {
dpi.url: dpi.path for dpi in artifacts_to_download
}
asyncio.run(async_download_files(files, get_config().concurrency_limit))

for dpi in artifacts_to_download:
log.info("-- Processing pypi download '%s'", dpi.path.name)
download_info = dpi.download_info
download_info = _process_download_info(req, download_info, dpi=dpi)
downloaded.append(download_info)

for dpi in artifacts_to_check:
log.info("-- Checking existing pypi artifact '%s'", dpi.path.name)
download_info = dpi.download_info
download_info = _process_download_info(req, download_info, dpi=dpi)
downloaded.append(download_info)

for pkg in to_download:
try:
if pkg.should_verify_checksums():
must_match_any_checksum(pkg.path, pkg.checksums_to_verify)
except PackageRejected:
pkg.path.unlink()
log.warning("Download '%s' was removed from the output directory", pkg.path.name)
elif req.kind == "vcs":
log.info("-- Processing vcs repo '%s'", dpi.path.name)
download_info = _download_vcs_package(req, pip_deps_dir)
download_info = _process_download_info(req, download_info)
downloaded.append(download_info)
elif req.kind == "url":
log.info("-- Processing url download '%s'", dpi.path.name)
download_info = _download_url_package(req, pip_deps_dir, trusted_hosts)
download_info = _process_download_info(req, download_info)
downloaded.append(download_info)
else:
# Should not happen
raise RuntimeError(f"Unexpected requirement kind: '{req.kind!r}'")

log.info("* Finished processing requirement line '%s'\n", req.download_line)

return downloaded
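
# Illustrative shape of one entry in the returned "downloaded" list; the
# "package", "version" and "path" keys are assumed to come from
# DistributionPackageInfo.download_info (property referenced further below),
# and the values here are hypothetical:
#   {
#       "package": "requests",
#       "version": "2.32.0",
#       "path": output_dir/deps/pip/requests-2.32.0.tar.gz,
#       "kind": "pypi",
#       "requirement_file": "requirements.txt",
#       "hash_verified": True,
#   }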

@@ -1688,8 +1717,7 @@ class DistributionPackageInfo:
checksums_to_verify: set[ChecksumInfo] = field(init=False, default_factory=set)

def __post_init__(self) -> None:
if self.package_type == "wheel":
self.checksums_to_verify = self._determine_checksums_to_verify()
self.checksums_to_verify = self._determine_checksums_to_verify()

def _determine_checksums_to_verify(self) -> set[ChecksumInfo]:
"""Determine the set of checksums to verify for a given distribution package."""
@@ -1711,15 +1739,15 @@ def _determine_checksums_to_verify(self) -> set[ChecksumInfo]:
log.debug("%s: %s", self.path.name, msg)
return checksums

def should_download_wheel(self) -> bool:
"""Determine if the wheel should be downloaded.
def should_download(self) -> bool:
"""Determine if this artifact should be downloaded.
If the user specified any checksums, but they do not match with those
reported by PyPI, we do not want to download the wheel.
reported by PyPI, we do not want to download the artifact.
Otherwise, we do.
"""
return self.package_type == "wheel" and (
return (
len(self.checksums_to_verify) > 0
or len(self.pypi_checksums) == 0
or len(self.user_checksums) == 0
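
# Rough decision table for should_download(), assuming
# _determine_checksums_to_verify() (body not shown in this hunk) yields the
# intersection when both user and PyPI checksums exist, and falls back to
# whichever side is non-empty otherwise:
#
#   user checksums   PyPI checksums   checksums_to_verify   download?
#   (none)           sha256:aaa       sha256:aaa            yes
#   sha256:aaa       (none)           sha256:aaa            yes
#   sha256:aaa       sha256:aaa       sha256:aaa            yes
#   sha256:aaa       sha256:bbb       (empty)               no, filtered out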
@@ -1741,7 +1769,7 @@ def download_info(self) -> dict[str, Any]:

def _process_package_distributions(
requirement: PipRequirement, pip_deps_dir: RootedPath, allow_binary: bool = False
) -> tuple[Optional[DistributionPackageInfo], list[DistributionPackageInfo]]:
) -> list[DistributionPackageInfo]:
"""
Return a DistributionPackageInfo object | a list of DPI objects, for the provided pip package.
@@ -1751,40 +1779,39 @@ def _process_package_distributions(
Filter to find the best matching sdist artifact.
Process wheel artifacts.
:param requirement: which pip package to process
:param str pip_deps_dir:
:param bool allow_binary: process wheels?
:return: a single DistributionPackageInfo, or a list of DPI
:rtype: DistributionPackageInfo
"""
allowed_distros = ["sdist", "wheel"] if allow_binary else ["sdist"]
artifacts: list[DistributionPackageInfo] = []
best_sdist: list[DistributionPackageInfo] = []
client = pypi_simple.PyPISimple()
name = requirement.package
version = requirement.version_specs[0][1]
normalized_version = canonicalize_version(version)
user_checksums = set(map(_to_checksum_info, requirement.hashes))

client = pypi_simple.PyPISimple()
try:
timeout = get_config().requests_timeout
project_page = client.get_project_page(name, timeout)
packages = project_page.packages
packages: list[pypi_simple.DistributionPackage] = project_page.packages
except (requests.RequestException, pypi_simple.NoSuchProjectError) as e:
raise FetchError(f"PyPI query failed: {e}")

allowed_distros = ["sdist", "wheel"] if allow_binary else ["sdist"]
filtered_packages = filter(
lambda x: x.version is not None
and canonicalize_version(x.version) == normalized_version
and x.package_type is not None
and x.package_type in allowed_distros,
packages,
)

sdists: list[DistributionPackageInfo] = []
wheels: list[DistributionPackageInfo] = []
def _is_valid(pkg: pypi_simple.DistributionPackage) -> bool:
return (
pkg.version is not None
and canonicalize_version(pkg.version) == normalized_version
and pkg.package_type is not None
and pkg.package_type in allowed_distros
)

user_checksums = set(map(_to_checksum_info, requirement.hashes))
valid_packages = (package for package in packages if _is_valid(package))

for package in filtered_packages:
for package in valid_packages:
pypi_checksums = {
ChecksumInfo(algorithm, digest) for algorithm, digest in package.digests.items()
}
@@ -1800,17 +1827,16 @@ def _process_package_distributions(
user_checksums,
)

if dpi.package_type == "sdist":
sdists.append(dpi)
if dpi.should_download():
artifacts.append(dpi)
else:
if dpi.should_download_wheel():
wheels.append(dpi)
else:
log.info("Filtering out %s due to checksum mismatch", package.filename)
log.info("Filtering out %s due to checksum mismatch", package.filename)

if len(sdists) != 0:
best_sdist = max(sdists, key=_sdist_preference)
if best_sdist.is_yanked:
sdists = [dpi for dpi in artifacts if dpi.package_type == "sdist"]
wheels = [dpi for dpi in artifacts if dpi.package_type == "wheel"]
if sdists:
best_sdist.append(max(sdists, key=_sdist_preference))
if best_sdist[0].is_yanked:
raise PackageRejected(
f"All sdists for package {name}=={version} are yanked",
solution=(
@@ -1822,7 +1848,6 @@ def _process_package_distributions(
)
else:
log.warning("No sdist found for package %s==%s", name, version)
best_sdist = None

if len(wheels) == 0:
if allow_binary:
@@ -1840,14 +1865,13 @@ def _process_package_distributions(
"Alternatively, allow the use of wheels."
)
docs = PIP_NO_SDIST_DOC

raise PackageRejected(
f"No distributions found for package {name}=={version}",
solution=solution,
docs=docs,
)

return best_sdist, wheels
return best_sdist + wheels
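
# Usage sketch (hypothetical caller code): with the tuple return gone, a
# caller that still needs the sdist/wheel distinction can re-partition the
# combined list by package_type:
#
#   artifacts = _process_package_distributions(req, pip_deps_dir, allow_binary)
#   sdists = [dpi for dpi in artifacts if dpi.package_type == "sdist"]
#   wheels = [dpi for dpi in artifacts if dpi.package_type == "wheel"]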


def _sdist_preference(sdist_pkg: DistributionPackageInfo) -> tuple[int, int]:
