Merge pull request #259 from medema-group/release/2.0.0-beta.5
Release/2.0.0 beta.5
adraismawur authored Feb 12, 2025
2 parents da5614a + a5ab8ac commit 1d2792e
Showing 29 changed files with 370 additions and 136 deletions.
32 changes: 22 additions & 10 deletions big_scape/benchmarking/benchmark_data_loader.py
@@ -13,7 +13,7 @@ class BenchmarkData:
Attributes:
curated_path (Path): location of curated GCF assignments file
in_path (Path): location of BiG-SCAPE output database
in_path (Path): location of BiG-SCAPE/BiG-SLICE output folder
"""

def __init__(self, curated_path: Path, in_path: Path) -> None:
@@ -23,7 +23,15 @@ def __init__(self, curated_path: Path, in_path: Path) -> None:
def read_gcf_short_tsv(self, infile: TextIO) -> dict[str, str]:
"""Read GCF assignments tsv file short format
Expects tsv file with BGC name and family per line and stores in {bgc_name: fam}
Expects tsv file with BGC name and family per line excluding a header, and
stores in {bgc_name: fam}. Regions should be named after their GBK (excluding
.gbk extension), while other record types should be formatted as
<GBK>.gbk_<recordtype>_<recordnumber>.
Example:
BGC GCF
JK1.region010 NRPS_1
JK1.region05.gbk_protocluster_1 NRPS_2
Args:
infile (TextIO): opened tsv file object
@@ -40,27 +48,31 @@ def read_gcf_short_tsv(self, infile: TextIO) -> dict[str, str]:
def read_gcf_long_tsv(self, infile: TextIO) -> dict[str, str]:
"""Read GCF assignment tsv file long format
Expects tsv file with BGC name, record type, record number and family per line.
Stores record information as BGC_record_number, if record type is region ignores
region number.
Expects tsv file with Record, GBK, record type, record number, CC and family
per line excluding a header line. Either stores BGC name as the GBK in case of
region record, or as <GBK>.gbk_<recordtype>_<recordnumber>.
Example:
Record GBK Record_Type Record_Number CC Family
JK1.region05.gbk_region_1 JK1.region05 region 1 1 FAM_00005
Args:
filename (TextIO): opened tsv file object
infile (TextIO): opened tsv file object
Returns:
dict[str, str]: dictionary linking BGC record to family
"""
data = {}
for line in infile:
parts = line.strip().split("\t")
clean_name = Path(parts[0]).name.replace(".gbk", "")
clean_name = Path(parts[1]).name.replace(".gbk", "")

if parts[1] == "region":
if parts[2] == "region":
bgc = clean_name
else:
bgc = f"{clean_name}_{parts[1]}_{parts[2]}"
bgc = f"{clean_name}.gbk_{parts[2]}_{parts[3]}"

data[bgc] = parts[4].replace("FAM_", "")
data[bgc] = parts[5].replace("FAM_", "")
return data

def load_curated_labels(self) -> None:
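Editor's note: the two assignment formats above differ only in how records are keyed. The following stand-alone sketch mirrors the parsing behaviour described in the docstrings; the helper names and the explicit header skip are illustrative assumptions, not the actual BenchmarkData.read_gcf_*_tsv methods.

```python
# Stand-alone sketch of the two benchmark assignment formats (illustrative only).
from io import StringIO
from pathlib import Path


def parse_short(infile) -> dict[str, str]:
    """Short format: <BGC name>\t<family>."""
    next(infile)  # skip header line (assumed)
    data = {}
    for line in infile:
        name, fam = line.strip().split("\t")
        data[name] = fam
    return data


def parse_long(infile) -> dict[str, str]:
    """Long format: Record, GBK, Record_Type, Record_Number, CC, Family."""
    next(infile)  # skip header line (assumed)
    data = {}
    for line in infile:
        parts = line.strip().split("\t")
        clean_name = Path(parts[1]).name.replace(".gbk", "")
        # regions are keyed by their GBK name, other record types by
        # <GBK>.gbk_<recordtype>_<recordnumber>
        if parts[2] == "region":
            bgc = clean_name
        else:
            bgc = f"{clean_name}.gbk_{parts[2]}_{parts[3]}"
        data[bgc] = parts[5].replace("FAM_", "")
    return data


short_map = parse_short(StringIO("BGC\tGCF\nJK1.region010\tNRPS_1\n"))
long_map = parse_long(StringIO(
    "Record\tGBK\tRecord_Type\tRecord_Number\tCC\tFamily\n"
    "JK1.region05.gbk_region_1\tJK1.region05\tregion\t1\t1\tFAM_00005\n"
))
# short_map == {"JK1.region010": "NRPS_1"}
# long_map  == {"JK1.region05": "00005"}
```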
10 changes: 8 additions & 2 deletions big_scape/cli/cli_common_options.py
@@ -109,7 +109,11 @@ def common_cluster_query(fn):
exists=True, file_okay=False, dir_okay=True, path_type=Path
),
required=True,
help="Input directory containing .gbk files to be used by BiG-SCAPE. See the wiki for more details.",
help=(
"Input directory containing .gbk files to be used by BiG-SCAPE. "
"Duplicated filenames can be handled, but are not recommended. "
"See the wiki for more details."
),
),
click.option(
"--config-file-path",
@@ -139,6 +143,7 @@ def common_cluster_query(fn):
help=(
"Tells BiG-SCAPE where to look for input GBK files. "
"recursive: search for .gbk files recursively in input directory. "
"Duplicated filenames are not recommended. "
"flat: search for .gbk files in input directory only. "
"(default: recursive)."
),
@@ -168,7 +173,8 @@ def common_cluster_query(fn):
exists=True, file_okay=False, dir_okay=True, path_type=Path
),
help=(
"Path to directory containing user defined, non-MIBiG, antiSMASH processed reference BGCs."
"Path to directory containing user defined, non-MIBiG, antiSMASH processed reference BGCs. "
"Duplicated filenames are not recommended. "
"For more information, see the wiki."
),
),
2 changes: 0 additions & 2 deletions big_scape/comparison/__init__.py
@@ -11,7 +11,6 @@
legacy_get_class,
as_class_bin_generator,
get_legacy_weights_from_category,
get_record_category,
)
from .comparable_region import ComparableRegion
from .workflow import generate_edges
@@ -41,7 +40,6 @@
"legacy_get_class",
"as_class_bin_generator",
"get_legacy_weights_from_category",
"get_record_category",
"save_edge_to_db",
"save_edges_to_db",
"lcs",
49 changes: 1 addition & 48 deletions big_scape/comparison/binning.py
@@ -23,7 +23,6 @@
from big_scape.genbank import (
BGCRecord,
Region,
CandidateCluster,
ProtoCluster,
ProtoCore,
)
@@ -737,7 +736,7 @@ def as_class_bin_generator(
record_class = record.product

if classify_mode == CLASSIFY_MODE.CATEGORY:
record_class = get_record_category(record)
record_class = record.get_category()

if run["hybrids_off"]:
record_classes = record_class.split(".")
@@ -773,52 +772,6 @@ def as_class_bin_generator(
yield bin


def get_record_category(record: BGCRecord) -> str:
"""Get the category of a BGC based on its antiSMASH product(s)
Args:
region (Region): region object
Returns:
str: BGC category
"""

def cand_cluster_category(cand_cluster: CandidateCluster) -> set[str]:
"""Grab categories that occur in a candidate cluster"""
categories = set()
for proto in cand_cluster.proto_clusters.values():
if proto is not None:
categories.update(proto_category(proto))
return categories

def proto_category(proto: ProtoCluster | ProtoCore) -> set[str]:
"""Grab category(s) that occurs in Protocluster or Protocore"""
# merged protocluster/cores can contain multiple categories joined by "."
return set(proto.category.split(".") if proto.category is not None else [])

categories: set[str] = set()

if isinstance(record, Region):
# get categories from region object
for cand_cluster in record.cand_clusters.values():
if cand_cluster is not None:
categories.update(cand_cluster_category(cand_cluster))

if isinstance(record, CandidateCluster):
categories.update(cand_cluster_category(record))

if isinstance(record, ProtoCluster) or isinstance(record, ProtoCore):
categories.update(proto_category(record))

if len(categories) == 0:
return "Categoryless"

if len(categories) == 1:
return list(categories)[0]

return ".".join(categories)


def get_legacy_weights_from_category(
record: BGCRecord, record_class: str, run: dict
) -> str:
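Editor's note: with get_record_category removed from binning.py, class bins now key off record.get_category(). A minimal sketch of how a dotted category string splits into separate bins when run["hybrids_off"] is set, assuming the bin label is exactly the (split) category string; the real as_class_bin_generator additionally assigns legacy weights and yields one bin generator per label.

```python
# Illustrative sketch only: expansion of a dotted category into class-bin labels.
def bin_labels(record_category: str, hybrids_off: bool) -> list[str]:
    if hybrids_off:
        # a hybrid such as "NRPS.PKS" is counted in two separate bins
        return record_category.split(".")
    # otherwise the hybrid keeps a single combined bin label
    return [record_category]


assert bin_labels("NRPS.PKS", hybrids_off=True) == ["NRPS", "PKS"]
assert bin_labels("NRPS.PKS", hybrids_off=False) == ["NRPS.PKS"]
assert bin_labels("Categoryless", hybrids_off=True) == ["Categoryless"]
```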
2 changes: 1 addition & 1 deletion big_scape/distances/classify.py
@@ -66,7 +66,7 @@ def callback(edges):
callback,
)

bs_comparison.save_edges_to_db(save_batch)
bs_comparison.save_edges_to_db(save_batch, commit=True)

bs_data.DB.commit()

2 changes: 1 addition & 1 deletion big_scape/distances/legacy_classify.py
@@ -65,7 +65,7 @@ def callback(edges):
callback,
)

bs_comparison.save_edges_to_db(save_batch)
bs_comparison.save_edges_to_db(save_batch, commit=True)

bs_data.DB.commit()

3 changes: 1 addition & 2 deletions big_scape/distances/mix.py
@@ -62,8 +62,7 @@ def callback(edges):
run["cores"] * 2,
callback,
)

bs_comparison.save_edges_to_db(save_batch)
bs_comparison.save_edges_to_db(save_batch, commit=True)

bs_data.DB.commit()

10 changes: 6 additions & 4 deletions big_scape/distances/query.py
@@ -82,7 +82,9 @@ def calculate_distances_query(
return query_bin_connected


def get_query_records(run, all_bgc_records, query_record) -> list[bs_gbk.BGCRecord]:
def get_query_records(
run: dict, all_bgc_records: list[bs_gbk.BGCRecord], query_record: bs_gbk.BGCRecord
) -> list[bs_gbk.BGCRecord]:
"""returns the query records and checks if the query is a singleton
Args:
Expand Down Expand Up @@ -130,8 +132,8 @@ def get_query_records(run, all_bgc_records, query_record) -> list[bs_gbk.BGCReco
query_records.append(record)

if classify_mode == bs_enums.CLASSIFY_MODE.CATEGORY:
query_category = [bs_comparison.get_record_category(query_record)]
record_category = [bs_comparison.get_record_category(record)]
query_category = [query_record.get_category()]
record_category = [record.get_category()]

intersect_cats = list(set(query_category) & set(record_category))
if len(intersect_cats) > 0:
Expand Down Expand Up @@ -197,7 +199,7 @@ def callback(edges):
callback,
)

bs_comparison.save_edges_to_db(save_batch)
bs_comparison.save_edges_to_db(save_batch, commit=True)

bs_data.DB.commit()

42 changes: 37 additions & 5 deletions big_scape/file_input/load_files.py
@@ -5,6 +5,7 @@
from pathlib import Path
from typing import List, Optional
import os
import re
import glob
import tarfile
import multiprocessing
@@ -35,11 +36,7 @@ def get_mibig(mibig_version: str, bigscape_dir: Path):
Path: path to MIBiG database (antismash processed gbks)
"""

mibig_url_base = "https://dl.secondarymetabolites.org/mibig/"
mibig_url = mibig_url_base + f"mibig_antismash_{mibig_version}_gbk.tar.bz2"
# TODO: this only works for 3.1, update to proper link once Kai makes it available
# https://dl.secondarymetabolites.org/mibig/mibig_antismash_3.1_gbk.tar.bz2
# https://dl.secondarymetabolites.org/mibig/mibig_antismash_3.1_json.tar.bz2
mibig_url = find_mibig_version_url(mibig_version)

mibig_dir = Path(os.path.join(bigscape_dir, "MIBiG"))
mibig_version_dir = Path(
@@ -62,6 +59,37 @@
return mibig_version_dir


def find_mibig_version_url(mibig_version: str) -> str:
"""Scrape the MIBiG downloads page for a link to a specific MIBiG version
Args:
mibig_version (str): requested MIBiG version
Raises:
RuntimeError: MIBiG downloads page could not be reached
ValueError: No download link found for specified version
Returns:
str: url to download a specified MIBiG version
"""
# scrape the downloads page to find the download link to the requested MIBiG version
mibig_url_base = "https://dl.secondarymetabolites.org/mibig/"
dl_page = requests.get(mibig_url_base)

if dl_page.status_code != 200:
raise RuntimeError("MIBiG Downloads page could not be reached")

# file pattern follows mibig_antismash_<version>_gbk[_<as_version>].tar.bz2
# [_<as_version>] being optional: present for 4.0, absent for 3.1
version_match = re.search(
rf"mibig_antismash_{re.escape(mibig_version)}_gbk.*?\.tar\.bz2", dl_page.text
)

if not version_match:
raise ValueError(f"MIBiG version {mibig_version} was not found")

return mibig_url_base + version_match.group()


def download_dataset(url: str, path: Path, path_compressed: Path) -> None:
# pragma: no cover
"""A function to download and decompress a dataset from an online repository
@@ -94,6 +122,10 @@ def download_dataset(url: str, path: Path, path_compressed: Path) -> None:
# TODO: deal with deprecation
file.extractall(path)

# make sure directory naming is consistent
src = path / url.replace(".tar.bz2", "").split("/")[-1]
dst = str(path_compressed).replace(".tar.bz2", "")
os.rename(src, dst)
os.remove(path_compressed)


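Editor's note: a hedged usage sketch of the new MIBiG download flow in load_files.py: resolve the version-specific URL, then hand it to download_dataset. The base-directory layout and the extracted-folder naming are assumptions based on the rename step above, not verbatim get_mibig internals.

```python
# Hedged usage sketch; directory layout is an assumption, not get_mibig() verbatim.
from pathlib import Path

mibig_version = "3.1"
bigscape_dir = Path("./bigscape")  # illustrative location

# scrape https://dl.secondarymetabolites.org/mibig/ for the matching archive,
# e.g. mibig_antismash_3.1_gbk.tar.bz2 (newer releases may carry an antiSMASH
# version suffix before .tar.bz2)
url = find_mibig_version_url(mibig_version)

mibig_dir = bigscape_dir / "MIBiG"
path_compressed = mibig_dir / url.split("/")[-1]

# downloads the archive, extracts it, renames the extracted folder to the
# archive name without .tar.bz2, and removes the archive
download_dataset(url, mibig_dir, path_compressed)

mibig_version_dir = Path(str(path_compressed).replace(".tar.bz2", ""))
# antiSMASH-processed MIBiG GBKs now live under mibig_version_dir
```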
38 changes: 38 additions & 0 deletions big_scape/genbank/bgc_record.py
@@ -53,6 +53,7 @@ def __init__(
nt_stop: int,
contig_edge: Optional[bool],
product: str,
category: Optional[str] = None,
):
self.parent_gbk = parent_gbk
self.number = number
@@ -61,6 +62,7 @@
self.nt_start = nt_start
self.nt_stop = nt_stop
self.product = product
self.category = category
self.merged: bool = False

# for database operations
Expand Down Expand Up @@ -190,6 +192,42 @@ def get_cds_start_stop(self) -> tuple[int, int]:
break
return record_start, record_stop

def set_record_category(self) -> None:
"""Sets the category of a record to categories in this record and its subrecords
Relevant for regions and candidate clusters. Protocluster and protocore category
is set during parsing of their feature, since they have only one category.
"""
# obtain categories present in any sub-records
categories = self.get_categories()

if len(categories) == 0:
# if no categories given, category remains None
return
elif len(categories) == 1:
category = list(categories)[0]
else:
category = ".".join(sorted(categories))
self.category = category

def get_categories(self) -> set[str]:
"""Obtain a set of unique categories associated with this record
Semi-template method, overwritten by Region and CandidateCluster classes.
Returns an empty set if no categories are associated.
"""
return set(self.category.split(".") if self.category is not None else [])

def get_category(self) -> str:
"""Returns the category of a record or 'Categoryless' if category is None
Should be used instead of directly accessing the category attribute
to ensure compatibility with as5 and below, as well as --force-gbk
"""
if self.category is not None:
return self.category
return "Categoryless"

def save_record(
self, record_type: str, parent_id: Optional[int] = None, commit=True
) -> None:
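Editor's note: to make the new category plumbing concrete, here is a stand-alone sketch of the roll-up that set_record_category performs over get_categories(), and of the get_category() fallback. These helpers are illustrative, not methods on BGCRecord.

```python
# Illustrative roll-up of sub-record categories, mirroring set_record_category()
# and get_category() on BGCRecord.
def roll_up(categories: set[str]) -> str | None:
    if len(categories) == 0:
        return None                      # category remains None
    if len(categories) == 1:
        return next(iter(categories))
    return ".".join(sorted(categories))  # e.g. {"NRPS", "PKS"} -> "NRPS.PKS"


def display(category: str | None) -> str:
    # mirrors get_category(): None becomes "Categoryless" (as5 and below, --force-gbk)
    return category if category is not None else "Categoryless"


assert roll_up(set()) is None
assert roll_up({"NRPS"}) == "NRPS"
assert roll_up({"PKS", "NRPS"}) == "NRPS.PKS"
assert display(None) == "Categoryless"
```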