Merge pull request #259 from medema-group/release/2.0.0-beta.5
Release/2.0.0 beta.5
adraismawur authored Feb 12, 2025
2 parents da5614a + a5ab8ac commit 1d2792e
Showing 29 changed files with 370 additions and 136 deletions.
32 changes: 22 additions & 10 deletions big_scape/benchmarking/benchmark_data_loader.py
@@ -13,7 +13,7 @@ class BenchmarkData:
Attributes:
curated_path (Path): location of curated GCF assignments file
in_path (Path): location of BiG-SCAPE output database
in_path (Path): location of BiG-SCAPE/BiG-SLICE output folder
"""

def __init__(self, curated_path: Path, in_path: Path) -> None:
@@ -23,7 +23,15 @@ def __init__(self, curated_path: Path, in_path: Path) -> None:
def read_gcf_short_tsv(self, infile: TextIO) -> dict[str, str]:
"""Read GCF assignments tsv file short format
Expects tsv file with BGC name and family per line and stores in {bgc_name: fam}
Expects tsv file with BGC name and family per line excluding a header, and
stores in {bgc_name: fam}. Regions should be named after their GBK (excluding
.gbk extension), while other record types should be formatted as
<GBK>.gbk_<recordtype>_<recordnumber>.
Example:
BGC GCF
JK1.region010 NRPS_1
JK1.region05.gbk_protocluster_1 NRPS_2
Args:
infile (TextIO): opened tsv file object
@@ -40,27 +48,31 @@ def read_gcf_short_tsv(self, infile: TextIO) -> dict[str, str]:
def read_gcf_long_tsv(self, infile: TextIO) -> dict[str, str]:
"""Read GCF assignment tsv file long format
Expects tsv file with BGC name, record type, record number and family per line.
Stores record information as BGC_record_number, if record type is region ignores
region number.
Expects tsv file with Record, GBK, record type, record number, CC and family
per line excluding a header line. Either stores BGC name as the GBK in case of
region record, or as <GBK>.gbk_<recordtype>_<recordnumber>.
Example:
Record GBK Record_Type Record_Number CC Family
JK1.region05.gbk_region_1 JK1.region05 region 1 1 FAM_00005
Args:
filename (TextIO): opened tsv file object
infile (TextIO): opened tsv file object
Returns:
dict[str, str]: dictionary linking BGC record to family
"""
data = {}
for line in infile:
parts = line.strip().split("\t")
clean_name = Path(parts[0]).name.replace(".gbk", "")
clean_name = Path(parts[1]).name.replace(".gbk", "")

if parts[1] == "region":
if parts[2] == "region":
bgc = clean_name
else:
bgc = f"{clean_name}_{parts[1]}_{parts[2]}"
bgc = f"{clean_name}.gbk_{parts[2]}_{parts[3]}"

data[bgc] = parts[4].replace("FAM_", "")
data[bgc] = parts[5].replace("FAM_", "")
return data

def load_curated_labels(self) -> None:
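Editor's note: the two assignment formats above differ only in how records are keyed. The following stand-alone sketch mirrors the parsing behaviour described in the docstrings; the helper names and the explicit header skip are illustrative assumptions, not the actual BenchmarkData.read_gcf_*_tsv methods.

```python
# Stand-alone sketch of the two benchmark assignment formats (illustrative only).
from io import StringIO
from pathlib import Path


def parse_short(infile) -> dict[str, str]:
    """Short format: <BGC name>\t<family>."""
    next(infile)  # skip header line (assumed)
    data = {}
    for line in infile:
        name, fam = line.strip().split("\t")
        data[name] = fam
    return data


def parse_long(infile) -> dict[str, str]:
    """Long format: Record, GBK, Record_Type, Record_Number, CC, Family."""
    next(infile)  # skip header line (assumed)
    data = {}
    for line in infile:
        parts = line.strip().split("\t")
        clean_name = Path(parts[1]).name.replace(".gbk", "")
        # regions are keyed by their GBK name, other record types by
        # <GBK>.gbk_<recordtype>_<recordnumber>
        if parts[2] == "region":
            bgc = clean_name
        else:
            bgc = f"{clean_name}.gbk_{parts[2]}_{parts[3]}"
        data[bgc] = parts[5].replace("FAM_", "")
    return data


short_map = parse_short(StringIO("BGC\tGCF\nJK1.region010\tNRPS_1\n"))
long_map = parse_long(StringIO(
    "Record\tGBK\tRecord_Type\tRecord_Number\tCC\tFamily\n"
    "JK1.region05.gbk_region_1\tJK1.region05\tregion\t1\t1\tFAM_00005\n"
))
# short_map == {"JK1.region010": "NRPS_1"}
# long_map  == {"JK1.region05": "00005"}
```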
10 changes: 8 additions & 2 deletions big_scape/cli/cli_common_options.py
@@ -109,7 +109,11 @@ def common_cluster_query(fn):
exists=True, file_okay=False, dir_okay=True, path_type=Path
),
required=True,
help="Input directory containing .gbk files to be used by BiG-SCAPE. See the wiki for more details.",
help=(
"Input directory containing .gbk files to be used by BiG-SCAPE. "
"Duplicated filenames can be handled, but are not recommended. "
"See the wiki for more details."
),
),
click.option(
"--config-file-path",
@@ -139,6 +143,7 @@ def common_cluster_query(fn):
help=(
"Tells BiG-SCAPE where to look for input GBK files. "
"recursive: search for .gbk files recursively in input directory. "
"Duplicated filenames are not recommended. "
"flat: search for .gbk files in input directory only. "
"(default: recursive)."
),
@@ -168,7 +173,8 @@ def common_cluster_query(fn):
exists=True, file_okay=False, dir_okay=True, path_type=Path
),
help=(
"Path to directory containing user defined, non-MIBiG, antiSMASH processed reference BGCs."
"Path to directory containing user defined, non-MIBiG, antiSMASH processed reference BGCs. "
"Duplicated filenames are not recommended. "
"For more information, see the wiki."
),
),
2 changes: 0 additions & 2 deletions big_scape/comparison/__init__.py
@@ -11,7 +11,6 @@
legacy_get_class,
as_class_bin_generator,
get_legacy_weights_from_category,
get_record_category,
)
from .comparable_region import ComparableRegion
from .workflow import generate_edges
@@ -41,7 +40,6 @@
"legacy_get_class",
"as_class_bin_generator",
"get_legacy_weights_from_category",
"get_record_category",
"save_edge_to_db",
"save_edges_to_db",
"lcs",
49 changes: 1 addition & 48 deletions big_scape/comparison/binning.py
@@ -23,7 +23,6 @@
from big_scape.genbank import (
BGCRecord,
Region,
CandidateCluster,
ProtoCluster,
ProtoCore,
)
@@ -737,7 +736,7 @@ def as_class_bin_generator(
record_class = record.product

if classify_mode == CLASSIFY_MODE.CATEGORY:
record_class = get_record_category(record)
record_class = record.get_category()

if run["hybrids_off"]:
record_classes = record_class.split(".")
@@ -773,52 +772,6 @@ def as_class_bin_generator(
yield bin


def get_record_category(record: BGCRecord) -> str:
"""Get the category of a BGC based on its antiSMASH product(s)
Args:
region (Region): region object
Returns:
str: BGC category
"""

def cand_cluster_category(cand_cluster: CandidateCluster) -> set[str]:
"""Grab categories that occur in a candidate cluster"""
categories = set()
for proto in cand_cluster.proto_clusters.values():
if proto is not None:
categories.update(proto_category(proto))
return categories

def proto_category(proto: ProtoCluster | ProtoCore) -> set[str]:
"""Grab category(s) that occurs in Protocluster or Protocore"""
# merged protocluster/cores can contain multiple categories joined by "."
return set(proto.category.split(".") if proto.category is not None else [])

categories: set[str] = set()

if isinstance(record, Region):
# get categories from region object
for cand_cluster in record.cand_clusters.values():
if cand_cluster is not None:
categories.update(cand_cluster_category(cand_cluster))

if isinstance(record, CandidateCluster):
categories.update(cand_cluster_category(record))

if isinstance(record, ProtoCluster) or isinstance(record, ProtoCore):
categories.update(proto_category(record))

if len(categories) == 0:
return "Categoryless"

if len(categories) == 1:
return list(categories)[0]

return ".".join(categories)


def get_legacy_weights_from_category(
record: BGCRecord, record_class: str, run: dict
) -> str:
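Editor's note: with get_record_category removed from binning.py, class bins now key off record.get_category(). A minimal sketch of how a dotted category string splits into separate bins when run["hybrids_off"] is set, assuming the bin label is exactly the (split) category string; the real as_class_bin_generator additionally assigns legacy weights and yields one bin generator per label.

```python
# Illustrative sketch only: expansion of a dotted category into class-bin labels.
def bin_labels(record_category: str, hybrids_off: bool) -> list[str]:
    if hybrids_off:
        # a hybrid such as "NRPS.PKS" is counted in two separate bins
        return record_category.split(".")
    # otherwise the hybrid keeps a single combined bin label
    return [record_category]


assert bin_labels("NRPS.PKS", hybrids_off=True) == ["NRPS", "PKS"]
assert bin_labels("NRPS.PKS", hybrids_off=False) == ["NRPS.PKS"]
assert bin_labels("Categoryless", hybrids_off=True) == ["Categoryless"]
```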
2 changes: 1 addition & 1 deletion big_scape/distances/classify.py
@@ -66,7 +66,7 @@ def callback(edges):
callback,
)

bs_comparison.save_edges_to_db(save_batch)
bs_comparison.save_edges_to_db(save_batch, commit=True)

bs_data.DB.commit()

2 changes: 1 addition & 1 deletion big_scape/distances/legacy_classify.py
@@ -65,7 +65,7 @@ def callback(edges):
callback,
)

bs_comparison.save_edges_to_db(save_batch)
bs_comparison.save_edges_to_db(save_batch, commit=True)

bs_data.DB.commit()

3 changes: 1 addition & 2 deletions big_scape/distances/mix.py
@@ -62,8 +62,7 @@ def callback(edges):
run["cores"] * 2,
callback,
)

bs_comparison.save_edges_to_db(save_batch)
bs_comparison.save_edges_to_db(save_batch, commit=True)

bs_data.DB.commit()

10 changes: 6 additions & 4 deletions big_scape/distances/query.py
@@ -82,7 +82,9 @@ def calculate_distances_query(
return query_bin_connected


def get_query_records(run, all_bgc_records, query_record) -> list[bs_gbk.BGCRecord]:
def get_query_records(
run: dict, all_bgc_records: list[bs_gbk.BGCRecord], query_record: bs_gbk.BGCRecord
) -> list[bs_gbk.BGCRecord]:
"""returns the query records and checks if the query is a singleton
Args:
Expand Down Expand Up @@ -130,8 +132,8 @@ def get_query_records(run, all_bgc_records, query_record) -> list[bs_gbk.BGCReco
query_records.append(record)

if classify_mode == bs_enums.CLASSIFY_MODE.CATEGORY:
query_category = [bs_comparison.get_record_category(query_record)]
record_category = [bs_comparison.get_record_category(record)]
query_category = [query_record.get_category()]
record_category = [record.get_category()]

intersect_cats = list(set(query_category) & set(record_category))
if len(intersect_cats) > 0:
Expand Down Expand Up @@ -197,7 +199,7 @@ def callback(edges):
callback,
)

bs_comparison.save_edges_to_db(save_batch)
bs_comparison.save_edges_to_db(save_batch, commit=True)

bs_data.DB.commit()

42 changes: 37 additions & 5 deletions big_scape/file_input/load_files.py
@@ -5,6 +5,7 @@
from pathlib import Path
from typing import List, Optional
import os
import re
import glob
import tarfile
import multiprocessing
@@ -35,11 +36,7 @@ def get_mibig(mibig_version: str, bigscape_dir: Path):
Path: path to MIBiG database (antismash processed gbks)
"""

mibig_url_base = "https://dl.secondarymetabolites.org/mibig/"
mibig_url = mibig_url_base + f"mibig_antismash_{mibig_version}_gbk.tar.bz2"
# TODO: this only works for 3.1, update to proper link once Kai makes it available
# https://dl.secondarymetabolites.org/mibig/mibig_antismash_3.1_gbk.tar.bz2
# https://dl.secondarymetabolites.org/mibig/mibig_antismash_3.1_json.tar.bz2
mibig_url = find_mibig_version_url(mibig_version)

mibig_dir = Path(os.path.join(bigscape_dir, "MIBiG"))
mibig_version_dir = Path(
@@ -62,6 +59,37 @@
return mibig_version_dir


def find_mibig_version_url(mibig_version: str) -> str:
"""Scrape the MIBiG downloads page for a link to a specific MIBiG version
Args:
mibig_version (str): requested MIBiG version
Raises:
RuntimeError: MIBiG downloads page could not be reached
ValueError: No download link found for specified version
Returns:
str: url to download a specified MIBiG version
"""
# scrape the downloads page to find the download link to the requested MIBiG version
mibig_url_base = "https://dl.secondarymetabolites.org/mibig/"
dl_page = requests.get(mibig_url_base)

if dl_page.status_code != 200:
raise RuntimeError("MIBiG Downloads page could not be reached")

# file pattern follows mibig_antismash_<version>_gbk[_<as_version>].tar.bz2
# [_<as_version>] being optional: present for 4.0, absent for 3.1
version_match = re.search(
rf"mibig_antismash_{re.escape(mibig_version)}_gbk.*?\.tar\.bz2", dl_page.text
)

if not version_match:
raise ValueError(f"MIBiG version {mibig_version} was not found")

return mibig_url_base + version_match.group()


def download_dataset(url: str, path: Path, path_compressed: Path) -> None:
# pragma: no cover
"""A function to download and decompress a dataset from an online repository
@@ -94,6 +122,10 @@ def download_dataset(url: str, path: Path, path_compressed: Path) -> None:
# TODO: deal with deprecation
file.extractall(path)

# make sure directory naming is consistent
src = path / url.replace(".tar.bz2", "").split("/")[-1]
dst = str(path_compressed).replace(".tar.bz2", "")
os.rename(src, dst)
os.remove(path_compressed)


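Editor's note: a hedged usage sketch of the new MIBiG download flow in load_files.py: resolve the version-specific URL, then hand it to download_dataset. The base-directory layout and the extracted-folder naming are assumptions based on the rename step above, not verbatim get_mibig internals.

```python
# Hedged usage sketch; directory layout is an assumption, not get_mibig() verbatim.
from pathlib import Path

mibig_version = "3.1"
bigscape_dir = Path("./bigscape")  # illustrative location

# scrape https://dl.secondarymetabolites.org/mibig/ for the matching archive,
# e.g. mibig_antismash_3.1_gbk.tar.bz2 (newer releases may carry an antiSMASH
# version suffix before .tar.bz2)
url = find_mibig_version_url(mibig_version)

mibig_dir = bigscape_dir / "MIBiG"
path_compressed = mibig_dir / url.split("/")[-1]

# downloads the archive, extracts it, renames the extracted folder to the
# archive name without .tar.bz2, and removes the archive
download_dataset(url, mibig_dir, path_compressed)

mibig_version_dir = Path(str(path_compressed).replace(".tar.bz2", ""))
# antiSMASH-processed MIBiG GBKs now live under mibig_version_dir
```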
38 changes: 38 additions & 0 deletions big_scape/genbank/bgc_record.py
@@ -53,6 +53,7 @@ def __init__(
nt_stop: int,
contig_edge: Optional[bool],
product: str,
category: Optional[str] = None,
):
self.parent_gbk = parent_gbk
self.number = number
@@ -61,6 +62,7 @@
self.nt_start = nt_start
self.nt_stop = nt_stop
self.product = product
self.category = category
self.merged: bool = False

# for database operations
Expand Down Expand Up @@ -190,6 +192,42 @@ def get_cds_start_stop(self) -> tuple[int, int]:
break
return record_start, record_stop

def set_record_category(self) -> None:
"""Sets the category of a record to categories in this record and its subrecords
Relevant for regions and candidate clusters. Protocluster and protocore category
is set during parsing of their feature, since they have only one category.
"""
# obtain categories present in any sub-records
categories = self.get_categories()

if len(categories) == 0:
# if no categories given, category remains None
return
elif len(categories) == 1:
category = list(categories)[0]
else:
category = ".".join(sorted(categories))
self.category = category

def get_categories(self) -> set[str]:
"""Obtain a set of unique categories associated with this record
Semi-template method, overwritten by Region and CandidateCluster classes.
Returns an empty set if no categories are associated.
"""
return set(self.category.split(".") if self.category is not None else [])

def get_category(self) -> str:
"""Returns the category of a record or 'Categoryless' if category is None
Should be used instead of directly accessing the category attribute
to ensure compatibility with as5 and below, as well as --force-gbk
"""
if self.category is not None:
return self.category
return "Categoryless"

def save_record(
self, record_type: str, parent_id: Optional[int] = None, commit=True
) -> None:
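Editor's note: to make the new category plumbing concrete, here is a stand-alone sketch of the roll-up that set_record_category performs over get_categories(), and of the get_category() fallback. These helpers are illustrative, not methods on BGCRecord.

```python
# Illustrative roll-up of sub-record categories, mirroring set_record_category()
# and get_category() on BGCRecord.
def roll_up(categories: set[str]) -> str | None:
    if len(categories) == 0:
        return None                      # category remains None
    if len(categories) == 1:
        return next(iter(categories))
    return ".".join(sorted(categories))  # e.g. {"NRPS", "PKS"} -> "NRPS.PKS"


def display(category: str | None) -> str:
    # mirrors get_category(): None becomes "Categoryless" (as5 and below, --force-gbk)
    return category if category is not None else "Categoryless"


assert roll_up(set()) is None
assert roll_up({"NRPS"}) == "NRPS"
assert roll_up({"PKS", "NRPS"}) == "NRPS.PKS"
assert display(None) == "Categoryless"
```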