Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Refactor/create package #1

Draft
wants to merge 8 commits into
base: main
Choose a base branch
from
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -123,3 +123,5 @@ dmypy.json
.pyre/

# End of https://www.gitignore.io/api/python

src/caduceus/_version.py
2 changes: 1 addition & 1 deletion configs/model/caduceus.yaml
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
# Use open-source version of Mamba
_name_: caduceus_lm
config:
_target_: caduceus.configuration_caduceus.CaduceusConfig
_target_: caduceus.huggingface.configuration_caduceus.CaduceusConfig
# From original MambaConfig
d_model: 128
n_layer: 2
Expand Down
84 changes: 84 additions & 0 deletions pyproject.toml
Original file line number Diff line number Diff line change
@@ -0,0 +1,84 @@
[build-system]
requires = ["setuptools>=48", "setuptools_scm[toml]>=6.3.1"]
build-backend = "setuptools.build_meta"

[tool.setuptools_scm]
write_to = "src/caduceus/_version.py"
version_scheme = "post-release"
fallback_version = "0.0.0"

[tool.black]
line-length = 88
include = '\.pyi?$'
exclude = '''
/(
\.eggs
| \.git
| \.hg
| \.mypy_cache
| \.tox
| \.venv
| _build
| buck-out
| build
| dist
)/
'''

[tool.isort]
profile = "black"
known_first_party = ["caduceus"]
line_length = 88

[tool.pytest.ini_options]
addopts = "-ra"

[tool.coverage.run]
branch = true
source = ["src/caduceus"]

[tool.coverage.paths]
source = ["src/caduceus", "*/site-packages"]

[tool.coverage.report]
show_missing = true
exclude_lines = [
"pragma: no cover",
"if __name__ == .__main__.:",
"if typing.TYPE_CHECKING:",
"if TYPE_CHECKING:",
"raise NotImplementedError",
"raise AssertionError",
"@overload",
]

[tool.mypy]
# Error output
show_column_numbers = true
show_error_codes = true
show_error_context = true
show_traceback = true
pretty = true
check_untyped_defs = false
# Warnings
warn_no_return = true
warn_redundant_casts = true
warn_unreachable = true
files = ["src/caduceus", "tests"]

[tool.pylint.format]
max-line-length = 88

[tool.pylint.message_control]
enable = ["c-extension-no-member", "no-else-return"]

[tool.pylint.variables]
dummy-variables-rgx = "_+$|(_[a-zA-Z0-9_]*[a-zA-Z0-9]+?$)|dummy|^ignored_|^unused_"
ignored-argument-names = "_.*|^ignored_|^unused_|args|kwargs"

[tool.codespell]
ignore-words-list = " "

[tool.bandit]
exclude_dirs = ["tests"]
skips = ["B101"]
54 changes: 54 additions & 0 deletions setup.cfg
Original file line number Diff line number Diff line change
@@ -0,0 +1,54 @@
[metadata]
description = Bi-Directional Equivariant Long-Range DNA Sequence Modeling
name = caduceus
long_description = file: README.md
long_description_content_type = text/markdown
url = https://github.com/kuleshov-group/caduceus
platforms=any
author = Yair Schiff
classifiers =
Programming Language :: Python :: 3
Programming Language :: Python :: 3.8
Programming Language :: Python :: 3.9
Programming Language :: Python :: 3.10
Programming Language :: Python :: 3.11
Programming Language :: Python :: 3.12

[options]
python_requires = >=3.8
zip_safe = False
package_dir=
=src
packages = find:
install_requires=
biopython
causal-conv1d
datasets
einops
genomic-benchmarks
huggingface-hub
hydra-core
mamba-ssm
omegaconf
lightning[extra]
rich
scikit-learn
timm
torchmetrics
transformers
triton
pyfaidx
pandas

[options.extras_require]
tests =
pytest
dev =
%(tests)s
pre-commit

[options.packages.find]
exclude =
tests
tests.*
where=src
13 changes: 13 additions & 0 deletions src/caduceus/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,13 @@
"""Caduceus"""

import warnings

try:
from caduceus._version import __version__
except ImportError:
__version__ = "not-installed"
warnings.warn(
"You are running a non-installed version caduceus."
"If you are running this from a git repo, please run"
"`pip install -e .` to install the package."
)
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@

# Default data path is environment variable or <repo_root_dir>/data
if (default_data_path := os.getenv("DATA_PATH")) is None:
default_data_path = Path(__file__).parent.parent.parent.absolute()
default_data_path = Path(__file__).parent.parent.parent.parent.absolute()
default_data_path = default_data_path / "data"
else:
default_data_path = Path(default_data_path).absolute()
Expand Down
File renamed without changes.
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@
from genomic_benchmarks.data_check import is_downloaded
from genomic_benchmarks.loc2seq import download_dataset

from src.dataloaders.utils.rc import coin_flip, string_reverse_complement
from caduceus.dataloaders.utils.rc import coin_flip, string_reverse_complement


class GenomicBenchmarkDataset(torch.utils.data.Dataset):
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -9,8 +9,8 @@
import torch
from pyfaidx import Fasta

from src.dataloaders.utils.mlm import mlm_getitem
from src.dataloaders.utils.rc import coin_flip, string_reverse_complement
from caduceus.dataloaders.utils.mlm import mlm_getitem
from caduceus.dataloaders.utils.rc import coin_flip, string_reverse_complement

MAX_ALLOWED_LENGTH = 2 ** 20

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@
import torch
from datasets import load_dataset

from src.dataloaders.utils.rc import coin_flip, string_reverse_complement
from caduceus.dataloaders.utils.rc import coin_flip, string_reverse_complement


class NucleotideTransformerDataset(torch.utils.data.Dataset):
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -13,17 +13,17 @@
from datasets import Dataset
from torch.utils.data.dataloader import DataLoader

from caduceus.tokenization_caduceus import CaduceusTokenizer
import src.utils.train
from src.dataloaders.base import SequenceDataset, default_data_path
from src.dataloaders.datasets.genomic_bench_dataset import GenomicBenchmarkDataset
from src.dataloaders.datasets.hg38_char_tokenizer import CharacterTokenizer
from src.dataloaders.datasets.hg38_dataset import HG38Dataset
from src.dataloaders.datasets.nucleotide_transformer_dataset import NucleotideTransformerDataset
from src.dataloaders.fault_tolerant_sampler import FaultTolerantDistributedSampler
from src.dataloaders.fault_tolerant_sampler import RandomFaultTolerantSampler

logger = src.utils.train.get_logger(__name__)
from caduceus.huggingface.tokenization_caduceus import CaduceusTokenizer
import caduceus.utils.train
from caduceus.dataloaders.base import SequenceDataset, default_data_path
from caduceus.dataloaders.datasets.genomic_bench_dataset import GenomicBenchmarkDataset
from caduceus.dataloaders.datasets.hg38_char_tokenizer import CharacterTokenizer
from caduceus.dataloaders.datasets.hg38_dataset import HG38Dataset
from caduceus.dataloaders.datasets.nucleotide_transformer_dataset import NucleotideTransformerDataset
from caduceus.dataloaders.fault_tolerant_sampler import FaultTolerantDistributedSampler
from caduceus.dataloaders.fault_tolerant_sampler import RandomFaultTolerantSampler

logger = caduceus.utils.train.get_logger(__name__)


class HG38(SequenceDataset):
Expand Down
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
Original file line number Diff line number Diff line change
Expand Up @@ -12,11 +12,11 @@
except ImportError:
RMSNorm, layer_norm_fn, rms_norm_fn = None, None, None

from caduceus.modeling_rcps import (
from caduceus.huggingface.modeling_rcps import (
RCPSEmbedding, RCPSAddNormWrapper, RCPSLMHead, RCPSWrapper
)

from caduceus.modeling_caduceus import CaduceusConfig, CaduceusMixerModel, CaduceusForMaskedLM, create_block
from caduceus.huggingface.modeling_caduceus import CaduceusConfig, CaduceusMixerModel, CaduceusForMaskedLM, create_block


@pytest.mark.parametrize("batch_size", [4])
Expand Down
Empty file added src/caduceus/models/__init__.py
Empty file.
Empty file.
File renamed without changes.
File renamed without changes.
File renamed without changes.
Empty file.
Original file line number Diff line number Diff line change
Expand Up @@ -18,10 +18,10 @@
ColumnParallelLinear = None


from caduceus.configuration_caduceus import CaduceusConfig
from caduceus.modeling_caduceus import Caduceus
from src.models.sequence.long_conv_lm import LMBackbone
from src.models.sequence.long_conv_lm import _init_weights
from caduceus.huggingface.configuration_caduceus import CaduceusConfig
from caduceus.huggingface.modeling_caduceus import Caduceus
from caduceus.models.sequence.long_conv_lm import LMBackbone
from caduceus.models.sequence.long_conv_lm import _init_weights


class DNAEmbeddingModel(nn.Module, GenerationMixin):
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@
from einops import rearrange

try:
from src.ops.fftconv import fftconv_ref, fftconv_func, fftconv_heads_ref
from caduceus.ops.fftconv import fftconv_ref, fftconv_func, fftconv_heads_ref

except ImportError:
fftconv_func = None
Expand All @@ -16,10 +16,10 @@
except ImportError:
FusedDense = None

import src.utils.registry as registry
from src.utils.train import OptimModule
from src.utils.config import instantiate, auto_assign_attrs
from src.models.nn import Activation
import caduceus.utils.registry as registry
from caduceus.utils.train import OptimModule
from caduceus.utils.config import instantiate, auto_assign_attrs
from caduceus.models.nn import Activation


class FFTConvFuncv2(torch.autograd.Function):
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -26,8 +26,8 @@
except ImportError:
dropout_add_layer_norm = None

from src.utils import instantiate
import src.utils.registry as registry
from caduceus.utils import instantiate
import caduceus.utils.registry as registry


class CheckpointedModule(torch.nn.Module):
Expand Down
Empty file added src/caduceus/ops/__init__.py
Empty file.
File renamed without changes.
Empty file added src/caduceus/tasks/__init__.py
Empty file.
8 changes: 4 additions & 4 deletions src/tasks/decoders.py → src/caduceus/tasks/decoders.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,11 +6,11 @@
import torch.nn as nn
import torch.nn.functional as F

import src.models.nn.utils as U
import src.utils as utils
import src.utils.train
import caduceus.models.nn.utils as U
import caduceus.utils as utils
import caduceus.utils.train

log = src.utils.train.get_logger(__name__)
log = caduceus.utils.train.get_logger(__name__)


class Decoder(nn.Module):
Expand Down
4 changes: 2 additions & 2 deletions src/tasks/encoders.py → src/caduceus/tasks/encoders.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
from torch import nn

import src.models.nn.utils as U
import src.utils as utils
import caduceus.models.nn.utils as U
import caduceus.utils as utils


class Encoder(nn.Module):
Expand Down
4 changes: 2 additions & 2 deletions src/tasks/metrics.py → src/caduceus/tasks/metrics.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,11 +3,11 @@

import torch
import torch.nn.functional as F
import torchmetrics.functional as tm_f
from caduceus.tasks import torchmetrics as tm_f
from sklearn.metrics import f1_score, roc_auc_score, matthews_corrcoef
from torchmetrics.classification import MulticlassRecall, MulticlassPrecision

from torchmetrics import Metric
from caduceus.tasks.torchmetrics import Metric


class CorrectAggregatedMetric(Metric):
Expand Down
12 changes: 6 additions & 6 deletions src/tasks/tasks.py → src/caduceus/tasks/tasks.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,12 +4,12 @@
import torch.nn as nn
from einops import rearrange

import src.models.nn.utils as U
import src.tasks.metrics as M
import torchmetrics as tm
from src.models.nn.adaptive_softmax import AdaptiveEmbedding, ProjectedAdaptiveLogSoftmax
from src.tasks.torchmetrics import torchmetric_fns as tm_mine
from src.utils.config import to_list, instantiate
import caduceus.models.nn.utils as U
import caduceus.tasks.metrics as M
from caduceus.tasks import torchmetrics as tm
from caduceus.models.nn.adaptive_softmax import AdaptiveEmbedding, ProjectedAdaptiveLogSoftmax
from caduceus.tasks.torchmetrics import torchmetric_fns as tm_mine
from caduceus.utils.config import to_list, instantiate
from torchmetrics import MetricCollection


Expand Down
File renamed without changes.
File renamed without changes.
File renamed without changes.
Empty file.
File renamed without changes.
File renamed without changes.
Loading