conda · dholth · May 10, 2024 · May 10, 2024 · May 10, 2024 · May 10, 2024
diff --git a/conda_index/__init__.py b/conda_index/__init__.py
@@ -2,4 +2,4 @@
 conda index. Create repodata.json for collections of conda packages.
 """
 
-__version__ = "0.5.0"
+__version__ = "0.6.0"
diff --git a/conda_index/cli/__init__.py b/conda_index/cli/__init__.py
@@ -4,12 +4,15 @@
 
 import logging
 import os.path
+from pathlib import Path
 
 import click
 
 from conda_index.index import MAX_THREADS_DEFAULT, ChannelIndex, logutil
 
 from .. import yaml
+from ..index.shards import ChannelIndexShards, ShardedIndexCache
+from ..index.sqlitecache import CondaIndexCache
 
 
 @click.command(context_settings={"help_option_names": ["-h", "--help"]})
@@ -91,6 +94,23 @@
         repodata_version=2 which is supported in conda 24.5.0 or later.
         """,
 )
+@click.option(
+    "--save-fs-state/--no-save-fs-state",
+    help="""
+        Skip using listdir() to refresh the set of available packages. Used to
+        generate complete repodata.json from cache only when packages are not on
+        disk.
+        """,
+    default=False,
+    show_default=True,
+)
+@click.option(
+    "--upstream-stage",
+    help="""
+    Set to 'clone' to generate example repodata from conda-forge test database.
+    """,
+    default="fs",
+)
 @click.option(
     "--current-repodata/--no-current-repodata",
     help="""
@@ -110,6 +130,14 @@
     default=False,
     is_flag=True,
 )
+@click.option(
+    "--sharded",
+    help="""
+        Write index using shards
+        """,
+    default=False,
+    is_flag=True,
+)
-@click.option(
-    "--sharded",
-    help="""
-        Write index using shards
-        """,
-    default=False,
-    is_flag=True,
-)
+@click.option(
+    "--write-shards/--no-write-shards",
+    help="""
+        Write a repodata.msgpack.zst index and many smaller files per CEP-16.
+        """,
+    default=False,
+    is_flag=True,
+)
-@click.option(
-    "--sharded",
-    help="""
-        Write index using shards
-        """,
-    default=False,
-    is_flag=True,
-)
+@click.option(
+    "--write-shards/--no-write-shards",
+    help="""
+        Write a repodata.msgpack.zst index and many smaller files per CEP-16.
+        """,
+    default=False,
+    is_flag=True,
+)
 def cli(
     dir,
     patch_generator=None,
@@ -126,7 +154,10 @@ def cli(
     run_exports=False,
     compact=True,
     base_url=None,
+    save_fs_state=False,
+    upstream_stage="fs",
     current_repodata=True,
+    sharded=False,
-    sharded=False,
+    write_shards=False,
-    sharded=False,
+    write_shards=False,
 ):
     logutil.configure()
     if verbose:
@@ -135,7 +166,10 @@ def cli(
     if output:
         output = os.path.expanduser(output)
 
-    channel_index = ChannelIndex(
+    channel_index_class = ChannelIndexShards if sharded else ChannelIndex
+    cache_class = ShardedIndexCache if sharded else CondaIndexCache
+
+    channel_index = channel_index_class(
         os.path.expanduser(dir),
         channel_name=channel_name,
         output_root=output,
@@ -146,14 +180,32 @@ def cli(
         write_run_exports=run_exports,
         compact_json=compact,
         base_url=base_url,
+        save_fs_state=save_fs_state,
         write_current_repodata=current_repodata,
+        cache_class=cache_class,
+        upstream_stage=upstream_stage,
     )
 
+    if save_fs_state is False:
+        # We call listdir() in save_fs_state, or its remote fs equivalent; then
+        # we call changed_packages(); but the changed_packages query against a
+        # remote filesystem is different than the one we need for a local
+        # filesystem. How about skipping the extract packages stage entirely by
+        # returning no changed packages? Might fail if we use
+        # threads/multiprocessing.
+        def no_changed_packages(self, *args):
+            return []
+
+        channel_index.cache_class.changed_packages = no_changed_packages
+
     current_index_versions = None
     if current_index_versions_file:
         with open(current_index_versions_file) as f:
             current_index_versions = yaml.safe_load(f)
 
+    if patch_generator:
+        patch_generator = str(Path(patch_generator).expanduser())
+
     channel_index.index(
         patch_generator=patch_generator,  # or will use outdated .py patch functions
         current_index_versions=current_index_versions,

diff --git a/conda_index/index/__init__.py b/conda_index/index/__init__.py
@@ -209,27 +209,32 @@ def _make_seconds(timestamp):
 )
 
 
-def _apply_instructions(subdir, repodata, instructions):
+def _apply_instructions(subdir, repodata, instructions, new_pkg_fixes=None):
     repodata.setdefault("removed", [])
+    # apply to .tar.bz2 packages
     utils.merge_or_update_dict(
         repodata.get("packages", {}),
         instructions.get("packages", {}),
         merge=False,
         add_missing_keys=False,
     )
-    # we could have totally separate instructions for .conda than .tar.bz2, but it's easier if we assume
-    #    that a similarly-named .tar.bz2 file is the same content as .conda, and shares fixes
-    new_pkg_fixes = {
-        k.replace(CONDA_PACKAGE_EXTENSION_V1, CONDA_PACKAGE_EXTENSION_V2): v
-        for k, v in instructions.get("packages", {}).items()
-    }
 
+    if new_pkg_fixes is None:
+        # we could have totally separate instructions for .conda than .tar.bz2, but it's easier if we assume
+        #    that a similarly-named .tar.bz2 file is the same content as .conda, and shares fixes
+        new_pkg_fixes = {
+            k.replace(CONDA_PACKAGE_EXTENSION_V1, CONDA_PACKAGE_EXTENSION_V2): v
+            for k, v in instructions.get("packages", {}).items()
+        }
+
+    # apply .tar.bz2 fixes to packages.conda
     utils.merge_or_update_dict(
         repodata.get("packages.conda", {}),
         new_pkg_fixes,
         merge=False,
         add_missing_keys=False,
     )
+    # apply .conda-only fixes to packages.conda
     utils.merge_or_update_dict(
         repodata.get("packages.conda", {}),
         instructions.get("packages.conda", {}),
@@ -481,6 +486,7 @@ class ChannelIndex:
     :param channel_url: fsspec URL where package files live. If provided, channel_root will only be used for cache and index output.
     :param fs: ``MinimalFS`` instance to be used with channel_url. Wrap fsspec AbstractFileSystem with ``conda_index.index.fs.FsspecFS(fs)``.
     :param base_url: Add ``base_url/<subdir>`` to repodata.json to be able to host packages separate from repodata.json
+    :param save_fs_state: Pass False to use cached filesystem state instead of ``os.listdir(subdir)``
     """
 
     fs: MinimalFS | None = None
@@ -504,7 +510,9 @@ def __init__(
         channel_url: str | None = None,
         fs: MinimalFS | None = None,
         base_url: str | None = None,
+        save_fs_state=True,
         write_current_repodata=True,
+        upstream_stage: str = "fs",
     ):
         if threads is None:
             threads = MAX_THREADS_DEFAULT
@@ -531,7 +539,9 @@ def __init__(
         self.write_run_exports = write_run_exports
         self.compact_json = compact_json
         self.base_url = base_url
+        self.save_fs_state = save_fs_state
         self.write_current_repodata = write_current_repodata
+        self.upstream_stage = upstream_stage
 
     def index(
         self,
@@ -571,6 +581,10 @@ def extract_wrapper(args: tuple):
                     # runs in thread
                     subdir, verbose, progress, subdir_path = args
                     cache = self.cache_for_subdir(subdir)
+                    # exactly these packages (unless they are un-indexable) will
+                    # be in the output repodata
+                    if self.save_fs_state:
+                        cache.save_fs_state(subdir_path)
                     return self.extract_subdir_to_cache(
                         subdir, verbose, progress, subdir_path, cache
                     )
@@ -768,24 +782,26 @@ def cache_for_subdir(self, subdir):
             subdir=subdir,
             fs=self.fs,
             channel_url=self.channel_url,
+            upstream_stage=self.upstream_stage,
         )
         if cache.cache_is_brand_new:
             # guaranteed to be only thread doing this?
             cache.convert()
         return cache
 
     def extract_subdir_to_cache(
-        self, subdir, verbose, progress, subdir_path, cache: sqlitecache.CondaIndexCache
-    ):
+        self,
+        subdir: str,
+        verbose,
+        progress,
+        subdir_path,
+        cache: sqlitecache.CondaIndexCache,
+    ) -> str:
         """
         Extract all changed packages into the subdir cache.
 
         Return name of subdir.
         """
-        # exactly these packages (unless they are un-indexable) will be in the
-        # output repodata
-        cache.save_fs_state(subdir_path)
-
         log.debug("%s find packages to extract", subdir)
 
         # list so tqdm can show progress
@@ -1242,6 +1258,8 @@ def _maybe_write_output_paths(
             newline = b"\n"
             newline_option = None
 
+        # XXX could we avoid writing output_temp_path in some cases?
+
         # always use \n line separator
         with open(
             output_temp_path,

diff --git a/conda_index/index/convert_cache.py b/conda_index/index/convert_cache.py
@@ -20,7 +20,7 @@
 log = logging.getLogger(__name__)
 
 # maximum 'PRAGMA user_version' we support
-USER_VERSION = 1
+USER_VERSION = 2
 
 PATH_INFO = re.compile(
     r"""
@@ -72,7 +72,13 @@ def create(conn):
     # has md5, shasum. older? packages do not include timestamp?
     # SELECT path, datetime(json_extract(index_json, '$.timestamp'), 'unixepoch'), index_json from index_json
     conn.execute(
-        "CREATE TABLE IF NOT EXISTS index_json (path TEXT PRIMARY KEY, index_json BLOB)"
+        """
+        CREATE TABLE IF NOT EXISTS index_json (
+            path TEXT PRIMARY KEY, index_json BLOB,
+            name AS (json_extract(index_json, '$.name')),
+            sha256 AS (json_extract(index_json, '$.sha256'))
+        )
+        """
     )
     conn.execute(
         "CREATE TABLE IF NOT EXISTS recipe (path TEXT PRIMARY KEY, recipe BLOB)"
@@ -126,13 +132,14 @@ def migrate(conn):
             "conda-index cache is too new: version {user_version} > {USER_VERSION}"
         )
 
-    if user_version > 0:
-        return
-
-    remove_prefix(conn)
+    if user_version < 1:
+        remove_prefix(conn)
+        # PRAGMA can't accept ?-substitution
+        conn.execute("PRAGMA user_version=1")
 
-    # PRAGMA can't accept ?-substitution
-    conn.execute("PRAGMA user_version=1")
+    if user_version < 2:
+        add_computed_name(conn)
+        conn.execute("PRAGMA user_version=2")
 
 
 def remove_prefix(conn: sqlite3.Connection):
@@ -160,6 +167,21 @@ def basename(path):
         )
 
 
+def add_computed_name(db: sqlite3.Connection):
+    """
+    Ensure index_json has generated ``name`` and ``sha256`` columns.
+    """
+    existing = {info[1] for info in db.execute("PRAGMA table_xinfo(index_json)")}
+    additions = {
+        "name": "ALTER TABLE index_json ADD COLUMN name AS (json_extract(index_json, '$.name'))",
+        "sha256": "ALTER TABLE index_json ADD COLUMN sha256 AS (json_extract(index_json, '$.sha256'))",
+    }
+    # only add columns missing from the table_xinfo listing, so reruns are no-ops
+    for column, statement in additions.items():
+        if column not in existing:
+            db.execute(statement)
+
+
 def extract_cache_filesystem(path):
     """
     Yield interesting (match, <bytes>) members of filesystem at path.