This repository has been archived by the owner on Jan 2, 2025. It is now read-only.

🏗️ Prepare v0.4.0: add indexes and remove dtransform_out #15

Merged
merged 7 commits on Aug 22, 2022

19 changes: 9 additions & 10 deletions docs/guides/models.ipynb
@@ -5,7 +5,15 @@
"id": "2ee23aa0-22e1-45ae-8fe4-392702f95de5",
"metadata": {},
"source": [
"# All models"
"# Models"
]
},
{
"cell_type": "markdown",
"id": "fc3e116f",
"metadata": {},
"source": [
"For guidance on using the schema module, see the documentation of [lamindb](https://lamin.ai/docs/lamindb)."
]
},
{
@@ -19,7 +27,6 @@
" dobject,\n",
" dtransform,\n",
" dtransform_in,\n",
" dtransform_out,\n",
" jupynb,\n",
" version_yvzi,\n",
" usage,\n",
@@ -97,14 +104,6 @@
"source": [
"storage()"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "b4568d90",
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
22 changes: 16 additions & 6 deletions lnschema_core/__init__.py
@@ -4,22 +4,33 @@

import lnschema_core

All models:
Main tables:

.. autosummary::
:toctree: .

user
dobject
dtransform
dtransform_in
dtransform_out
user
usage

Data transformations:

.. autosummary::
:toctree: .

jupynb
pipeline_run
usage

Tracking migrations:

.. autosummary::
:toctree: .

version_yvzi

Helpers:
Non-table helper functionality:

.. autosummary::
:toctree: .
@@ -37,7 +48,6 @@
dobject,
dtransform,
dtransform_in,
dtransform_out,
jupynb,
pipeline_run,
storage,
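
Read together, the regrouped docstring above corresponds to imports along these lines (a minimal sketch, assuming the package is importable as `lnschema_core`):

    from lnschema_core import dobject, dtransform, dtransform_in, user, usage  # main tables
    from lnschema_core import jupynb, pipeline_run  # data transformations
    from lnschema_core import version_yvzi  # tracking migrations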
167 changes: 120 additions & 47 deletions lnschema_core/_core.py
@@ -12,29 +12,42 @@ def utcnow():


class version_yvzi(SQLModel, table=True): # type: ignore
"""Schema module version."""
"""Core schema module versions deployed in a given instance.

Migrations of the schema module add rows to this table, storing the schema
module version to which we migrated along with the user who performed the
migration.
"""

v: Optional[str] = Field(primary_key=True)
user_id: str = Field(foreign_key="user.id")
time_created: datetime = Field(default_factory=utcnow, nullable=False)
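
# Illustrative sketch, not part of this module: the migration prepared in this PR
# would be recorded by adding a row along these lines (the user id is a made-up example).
migration_record = version_yvzi(v="0.4.0", user_id="DzTjkKse")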


class user(SQLModel, table=True): # type: ignore
"""Users operating a given LaminDB instance."""
"""Users operating a given LaminDB instance.

    All data here is always synced from the corresponding table in the hub.
"""

__table_args__ = (
UniqueConstraint("email"),
UniqueConstraint("handle"),
)
id: Optional[str] = Field(primary_key=True)
email: str
handle: str = Field(nullable=False)
email: str = Field(index=True)
handle: str = Field(nullable=False, index=True)
time_created: datetime = Field(default_factory=utcnow, nullable=False)
time_updated: datetime = Field(default_factory=utcnow, nullable=False)


class storage(SQLModel, table=True): # type: ignore
"""Storage used by a given LaminDB instance."""
"""Storage locations.

A dobject or dtransform-associated file can be stored in any desired S3,
GCP, Azure or local storage location. This table tracks these locations
along with metadata.
"""

root: str = Field(primary_key=True)
region: Optional[str]
@@ -46,28 +59,73 @@
class dobject(SQLModel, table=True): # type: ignore
"""Data objects in storage & memory.

Storage ⟷ memory examples:
Data objects (`dobjects`) always represent a dataset, a set of jointly measured
observations of variables (features).

A `dobject` might contain a single observation, for instance, a single image.

Datasets typically have canonical on-disk and in-memory representations. If
choices among these representations are made, a one-to-one mapping can be
achieved, which means that any given `dobject` has a default in-memory and
on-disk representation.

LaminDB offers meaningful default representations. For instance,

- It defaults to pandas DataFrames for in-memory representation of tables
and allows you to configure loading tables into polars DataFrames.
- It defaults to the `.parquet` format for tables, but allows you to
configure `.csv` or `.ipc`.

- Table: `.csv`, `.tsv`, `.feather`, `.parquet` ⟷ `pd.DataFrame`
- Annotated matrix: `.h5ad`, `.h5mu`, `.zarrad` ⟷ `anndata.AnnData`, `mudata.MuData`
- Image: `.jpg`, `.png` ⟷ `np.ndarray`, or a dedicated imaging in-memory container
Some datasets do not have a canonical in-memory representation, for
instance, `.fastq`, `.vcf`, or files describing QC of datasets.

Examples for storage ⟷ memory correspondence:

- Table: `.csv`, `.tsv`, `.parquet`, `.ipc` (`.feather`) ⟷
`pandas.DataFrame`, `polars.DataFrame`
- Annotated matrix: `.h5ad`, `.h5mu`, `.zarrad` ⟷ `anndata.AnnData`,
`mudata.MuData`
- Image: `.jpg`, `.png` ⟷ `np.ndarray`, or a dedicated imaging in-memory
container
- Tensor: zarr directory, TileDB store ⟷ zarr loader, TileDB loader
- Fastq: fastq ⟷ /
- VCF: .vcf ⟷ /
- Fastq: `.fastq` ⟷ /
- VCF: `.vcf` ⟷ /
- QC: `.html` ⟷ /
"""

id: Optional[str] = Field(default_factory=id_dobject, primary_key=True)
v: str = Field(default=None, primary_key=True)
name: Optional[str]
file_suffix: str
dsource_id: str = Field(foreign_key="dtransform.id")
storage_root: str = Field(foreign_key="storage.root")
time_created: datetime = Field(default_factory=utcnow, nullable=False)
time_updated: datetime = Field(default_factory=utcnow, nullable=False)
name: Optional[str] = Field(index=True)
file_suffix: str = Field(index=True)
dtransform_id: str = Field(foreign_key="dtransform.id", index=True)
storage_root: str = Field(foreign_key="storage.root", index=True)
time_created: datetime = Field(default_factory=utcnow, nullable=False, index=True)
time_updated: datetime = Field(default_factory=utcnow, nullable=False, index=True)
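
# Illustrative sketch, not part of this module: the default table correspondence
# described in the docstring above, i.e. `.parquet` on disk ⟷ `pandas.DataFrame`
# in memory. The file name is a made-up example; other formats and loaders are configurable.
import pandas as pd

df = pd.DataFrame({"feature": ["a", "b"], "value": [1, 2]})
df.to_parquet("example.parquet")  # default on-disk representation of a table
df = pd.read_parquet("example.parquet")  # default in-memory representation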


class dtransform(SQLModel, table=True): # type: ignore
"""Data transformations."""
"""Data transformations.

A data transformation (`dtransform`) is _any_ transformation of a `dobject`.
For instance:

- Jupyter notebooks (`jupynb`)
- Pipeline runs of software (workflows) and scripts (`pipeline_run`).
- Physical instruments making measurements (needs to be configured).
- Human decisions based on data visualizations (needs to be configured).

It typically has inputs and outputs:

- References to outputs are stored in the `dobject` table in the
`dtransform_id` column, which stores a foreign key into the `dtransform`
table. This is possible as every given `dobject` has a unique data source:
the `dtransform` that produced the `dobject`. Note that a given
`dtransform` may output several `dobjects`.
- References to input `dobjects` are stored in the `dtransform_in` table, a
many-to-many link table between the `dobject` and `dtransform` tables. Any
    `dobject` might serve as an input for many `dtransforms`. Similarly, any
`dtransform` might have many `dobjects` as inputs.
"""

__table_args__ = (
ForeignKeyConstraint(
@@ -77,36 +135,30 @@ class dtransform(SQLModel, table=True):  # type: ignore
),
)
id: str = Field(default_factory=id_dtransform, primary_key=True)
jupynb_id: Union[str, None] = None
jupynb_v: Union[str, None] = None
jupynb_id: Union[str, None] = Field(default=None, index=True)
jupynb_v: Union[str, None] = Field(default=None, index=True)
pipeline_run_id: Union[str, None] = Field(
default=None, foreign_key="pipeline_run.id"
default=None, foreign_key="pipeline_run.id", index=True
)
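
# Illustrative sketch, not part of this module: how lineage is encoded. A dtransform
# (here a notebook run) produces an output dobject, referenced via `dobject.dtransform_id`,
# and consumes an input dobject, recorded in `dtransform_in`. All ids, versions, and the
# storage root below are made-up examples.
run = dtransform(jupynb_id="0ymQDuqM5Lwq", jupynb_v="1")
output = dobject(
    v="1",
    name="qc_metrics",
    file_suffix=".parquet",
    dtransform_id=run.id,
    storage_root="s3://my-bucket",
)
used_as_input = dtransform_in(dtransform_id=run.id, dobject_id="aB3dE5fG7hI9", dobject_v="1")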


class dtransform_in(SQLModel, table=True): # type: ignore
"""Inputs - link dtransform & dobject."""
"""Input data for data transformations.

__table_args__ = (
ForeignKeyConstraint(
["dobject_id", "dobject_v"],
["dobject.id", "dobject.v"],
name="dtransform_in_dobject",
),
)
dtransform_id: str = Field(foreign_key="dtransform.id", primary_key=True)
dobject_id: str = Field(primary_key=True)
dobject_v: str = Field(primary_key=True)
This is a many-to-many link table for `dtransform` and `dobject` storing the
inputs of data transformations.

A data transformation can have an arbitrary number of data objects as inputs.

class dtransform_out(SQLModel, table=True): # type: ignore
"""Outputs - link dtransform & dobject."""
- The same `dobject` can be used as input in many different `dtransforms`.
- One `dtransform` can have several `dobjects` as inputs.
"""

__table_args__ = (
ForeignKeyConstraint(
["dobject_id", "dobject_v"],
["dobject.id", "dobject.v"],
name="dtransform_out_dobject",
name="dtransform_in_dobject",
),
)
dtransform_id: str = Field(foreign_key="dtransform.id", primary_key=True)
@@ -115,18 +167,36 @@ class dtransform_out(SQLModel, table=True):  # type: ignore


class jupynb(SQLModel, table=True): # type: ignore
"""Jupyter notebooks."""
"""Jupyter notebooks.

Jupyter notebooks (`jupynbs`) represent one type of data transformation
(`dtransform`) and have a unique correspondence in `dtransform`.

IDs for Jupyter notebooks are generated through nbproject.
"""

id: str = Field(default=None, primary_key=True)
v: str = Field(default=None, primary_key=True)
name: Optional[str]
user_id: str = Field(foreign_key="user.id")
time_created: datetime = Field(default_factory=utcnow, nullable=False)
time_updated: datetime = Field(default_factory=utcnow, nullable=False)
name: Optional[str] = Field(index=True)
user_id: str = Field(foreign_key="user.id", index=True)
time_created: datetime = Field(default_factory=utcnow, nullable=False, index=True)
time_updated: datetime = Field(default_factory=utcnow, nullable=False, index=True)


class pipeline_run(SQLModel, table=True): # type: ignore
"""Pipeline runs."""
"""Pipeline runs.

Pipeline runs represent one type of data transformation (`dtransform`) and
have a unique correspondence in `dtransform`.

A pipeline is typically versioned software that can perform a data
transformation/processing workflow. This can be anything from typical
    workflow tools (Nextflow, Snakemake, Prefect, Apache Airflow, etc.) to
simple (versioned) scripts.

For instance, `lnbfx` stores references to bioinformatics workflow runs by
linking to entries in this table.
"""

id: str = Field(default=None, primary_key=True)

@@ -148,7 +218,10 @@ class usage_type(str, Enum):


class usage(SQLModel, table=True): # type: ignore
"""Data usage log: do operations on the database."""
"""Data usage log.

    Any call to the `lamindb.do` API is logged here.
"""

__table_args__ = (
ForeignKeyConstraint(
@@ -159,8 +232,8 @@ class usage(SQLModel, table=True):  # type: ignore
)

id: Optional[str] = Field(default_factory=id_usage, primary_key=True)
type: usage_type = Field(nullable=False)
user_id: str = Field(foreign_key="user.id", nullable=False)
time: datetime = Field(default_factory=utcnow, nullable=False)
dobject_id: str
dobject_v: str
type: usage_type = Field(nullable=False, index=True)
user_id: str = Field(foreign_key="user.id", nullable=False, index=True)
time: datetime = Field(default_factory=utcnow, nullable=False, index=True)
dobject_id: str = Field(index=True)
dobject_v: str = Field(index=True)
2 changes: 2 additions & 0 deletions lnschema_core/id.py
@@ -1,3 +1,5 @@
"""ID generators."""

import random
import string

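
The generator bodies themselves are collapsed in this diff. As a rough sketch of what an ID generator built on `random` and `string` could look like (the function name, alphabet, and length below are assumptions, not the actual implementation):

    import random
    import string

    def id_base62(n_char: int = 12) -> str:
        """Return a random base62 identifier of length n_char (sketch only)."""
        alphabet = string.digits + string.ascii_letters
        return "".join(random.choice(alphabet) for _ in range(n_char))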