Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Remove bad packages instead of failing #9

Merged
merged 10 commits into from
Nov 8, 2016
153 changes: 150 additions & 3 deletions conda_mirror/conda_mirror.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,5 @@
from __future__ import (unicode_literals, print_function, division,
absolute_import)
import requests
import argparse
import logging
Expand All @@ -9,7 +11,9 @@
import tqdm
from collections import deque
from conda_build.config import Config
from conda_build.index import update_index
from conda_build.index import update_index,read_index_tar
import fnmatch
import tarfile


logging.basicConfig(level=logging.INFO)
Expand Down Expand Up @@ -46,6 +50,14 @@ def get_repodata(channel, platform):


def _make_arg_parser():
"""
Localize the ArgumentParser logic

Returns
-------
argument_parser : argparse.ArgumentParser
The instantiated argument parser for this CLI
"""
ap = argparse.ArgumentParser(description="CLI interface for conda-mirror.py")

ap.add_argument(
Expand All @@ -68,6 +80,9 @@ def _make_arg_parser():


def cli():
"""
Collect arguments from sys.argv and invoke the main() function.
"""
ap = _make_arg_parser()
args = ap.parse_args()
if 'all' in args.platform and len(args.platform) != 1:
Expand All @@ -77,6 +92,25 @@ def cli():


def not_in_upstream(local_repo_metadata, upstream_repo_metadata):
"""
Produce a stream of packages that exist on the upstream channel but
not the local mirror

Parameters
----------
local_repo_metadata : dict
This is the 'packages' key from the repodata.json file
from the local channel
upstream_repo_metadata : dict
This is the 'packages' key from the repodata.json file
from the upstream channel

Yields
------
package_name : str
A continuous stream of package names that exist on the upstream channel
but not the local one
"""
upstream_package_names = set(upstream_repo_metadata.keys())
local_package_names = set(local_repo_metadata.keys())
for pkg in upstream_package_names.difference(local_package_names):
Expand All @@ -85,6 +119,28 @@ def not_in_upstream(local_repo_metadata, upstream_repo_metadata):

def not_blacklisted_license(package_names_to_mirror, upstream_repo_metadata,
bad_licenses=None):
"""
Trim list of packages to mirror based on their listed licenses

Parameters
----------
package_names_to_mirror : iterable
An iterable of package names to check and see if they have unfriendly
licenses. These package names should be keys in the
`upstream_repo_metadata` dict
upstream_repo_metadata : dict
The 'packages' value of the repodata.json dict for the upstream channel
that we are mirroring locally
bad_licenses: iterable, optional
All licenses that are considered "bad". Packages whose licenses are
in `bad_licenses` will not be mirrored locally

Yields
------
package_name : str
A continuous stream of package names whose licenses do not match those
in `bad_licenses`
"""
if bad_licenses is None:
bad_licenses = DEFAULT_BAD_LICENSES

Expand All @@ -106,6 +162,20 @@ def not_blacklisted_license(package_names_to_mirror, upstream_repo_metadata,


def main(upstream_channel, target_directory, platform):
"""
The business logic of conda_mirror.

Parameters
----------
upstream_channel : str
The anaconda.org channel that you want to mirror locally
e.g., "anaconda" or "conda-forge"
target_directory : str
The path on disk to produce a local mirror of the upstream channel
platform : iterable
The platforms that you want to mirror from anaconda.org/<upstream_channel>
The defaults are listed in the module level global "DEFAULT_PLATFORMS"
"""
full_platform_list = copy.copy(platform)
if 'all' in full_platform_list:
full_platform_list.remove('all')
Expand Down Expand Up @@ -152,7 +222,7 @@ def main(upstream_channel, target_directory, platform):
platform=platform,
file_name=package,
)
print(url)
logging.info("download_url={}".format(url))
expected_size = info['size']
chunk_size = 1024 # 1KB chunks
expected_iterations = expected_size // chunk_size + 1
Expand All @@ -175,11 +245,88 @@ def main(upstream_channel, target_directory, platform):
logging.info("The packages that were mirrored are:")
logging.info(pformat(mirrored_packages))


def run_conda_index(target_directory):
    """
    Call out to conda_build.index:update_index, removing packages that break
    indexing instead of failing.

    Indexing is retried after each removal until ``update_index`` finishes
    without raising.

    Parameters
    ----------
    target_directory : str
        The full path to the platform subdirectory inside of the local conda
        channel. The directory at this path should contain a "repodata.json" file
        e.g., /path/to/local/repo/linux-64

    Raises
    ------
    RuntimeError
        Re-raised from ``update_index`` when its message does not name exactly
        one ``*.tar.bz2`` file, i.e. there is no single package to remove.
    tarfile.ReadError
        Re-raised from ``update_index`` when `_find_bad_package` cannot
        identify any offending package; retrying would never terminate.
    """
    config = Config()
    config.timeout = 1
    # Loop rather than recurse so a directory with many bad packages cannot
    # exhaust the interpreter's recursion limit.
    while True:
        logging.info("Indexing {}".format(target_directory))
        try:
            update_index(target_directory, config, could_be_mirror=False)
        except RuntimeError as err:
            # The message is expected to look like:
            # "Could not extract upstream-mirror/linux-64/numpy-1.7.1-py27_p0.tar.bz2. File probably corrupt."
            # so pick out the token that looks like a package file name.
            candidates = fnmatch.filter(str(err).split(), "*.tar.bz2*")
            if len(candidates) != 1:
                # Not an error we know how to recover from; surface the
                # original exception instead of an unpacking ValueError.
                raise
            fname = candidates[0]
            # and drop the trailing '.'
            if fname.endswith('.'):
                fname = fname[:-1]
            logging.info(
                "Caught an exception while trying to index: {}".format(err))
            # _remove_package logs the "Removing: ..." message itself
            _remove_package(fname)
        except tarfile.ReadError:
            # Find the new packages that don't exist in the repodata and
            # cannot be read; there may be more than one.
            bad_packages = list(_find_bad_package(target_directory))
            if not bad_packages:
                # Nothing to remove, so another attempt would fail the same way
                raise
            for bad_package in bad_packages:
                _remove_package(bad_package)
        else:
            return


def _find_bad_package(local_platform_directory):
    """
    Find the package(s) that cause a `tarfile.ReadError`

    Only ``*.tar.bz2`` files that are present in the directory but missing
    from the "packages" section of "repodata.json" are examined. This function
    only reports bad packages; it does not delete anything.

    Parameters
    ----------
    local_platform_directory : str
        Path to one of the platform subdirectories of a local conda channel
        e.g., this is the folder that should contain all of the conda packages
        and a "repodata.json"

    Yields
    ------
    full_pkg_path : str
        The full path to a package that results in `conda_build.index:read_index_tar()`
        raising a tarfile.ReadError. Paths are yielded in sorted file name
        order.
    """
    repodata_fname = os.path.join(local_platform_directory, 'repodata.json')
    with open(repodata_fname, 'r') as f:
        repodata = json.load(f)
    indexed_packages = repodata.get('packages', {})
    all_packages = fnmatch.filter(os.listdir(local_platform_directory),
                                  "*.tar.bz2")
    # Packages already listed in repodata.json were read successfully by an
    # earlier indexing run, so only the unlisted ones can be at fault.
    # Sorting makes the order of the results reproducible.
    potentially_bad = sorted(set(all_packages).difference(indexed_packages))
    # One Config instance is enough for every read attempt
    config = Config()
    for pkg in potentially_bad:
        full_pkg_path = os.path.join(local_platform_directory, pkg)
        try:
            read_index_tar(full_pkg_path, config)
        except tarfile.ReadError as err:
            msg = "tarfile.ReadError encountered. Original error: {}".format(err)
            msg += "\nRemoving bad package: {}".format(full_pkg_path)
            logging.error(msg)
            yield full_pkg_path


def _remove_package(pkg_path):
"""
Log and remove a package.

Parameters
----------
pkg_path : str
Path to a conda package that should be removed
"""
logging.info("Removing: {}".format(pkg_path))
os.remove(pkg_path)


if __name__ == "__main__":
Expand Down
21 changes: 21 additions & 0 deletions test/test_conda_mirror.py
Original file line number Diff line number Diff line change
@@ -1,10 +1,13 @@
from __future__ import (unicode_literals, print_function, division,
absolute_import)
from conda_mirror import conda_mirror
import pytest
import requests_mock
import os
import subprocess
from contextlib import contextmanager
import sys
import bz2
import copy

@pytest.fixture(scope='session')
Expand Down Expand Up @@ -114,3 +117,21 @@ def test_cli(local_repo_root, tmpdir):
assert "b-1-0.tar.bz2" in contents
assert "c-1-0.tar.bz2" not in contents
assert "d-1-0.tar.bz2" not in contents


def test_handling_bad_package(local_repo_root):
    """Indexing a directory holding an unreadable package removes that package."""
    platform_dir = os.path.join(local_repo_root, 'linux-64')
    pkg_name = 'bad-1-0.tar.bz2'
    pkg_path = os.path.join(platform_dir, pkg_name)

    # Start from a clean slate in case an earlier run left the file behind
    if os.path.exists(pkg_path):
        os.remove(pkg_path)

    # A bz2 stream that is not a tar archive stands in for a corrupt package
    payload = "This is a fake package".encode()
    with bz2.BZ2File(pkg_path, 'wb') as handle:
        handle.write(payload)

    files_before = os.listdir(platform_dir)
    assert pkg_name in files_before

    conda_mirror.run_conda_index(platform_dir)

    files_after = os.listdir(platform_dir)
    assert pkg_name not in files_after