Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

🚚 Rename FeatureValue to Category #230

Merged
merged 9 commits into from
Jul 19, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion lnschema_core/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,10 +12,10 @@
from . import ids, types
from .models import ( # type: ignore
ORM,
Category,
Dataset,
Feature,
FeatureSet,
FeatureValue,
File,
Project,
Run,
Expand Down

This file was deleted.

Original file line number Diff line number Diff line change
@@ -0,0 +1,98 @@
# Generated by Django 4.2.2 on 2023-07-19 20:05

import django.db.models.deletion
from django.db import migrations, models

import lnschema_core.ids
import lnschema_core.users


class Migration(migrations.Migration):
    """Schema migration following ``0008_file_hash_type_transform_parents``.

    Operations, in order:

    * Add ``File.feature_sets`` (many-to-many to ``FeatureSet``), copy the
      existing link rows over from the ``FeatureSet.files`` through table,
      then remove ``FeatureSet.files``.
    * Add ``Feature.unit`` and alter ``Feature.description`` / ``Feature.type``
      (all declared with ``db_index=True``).
    * Alter ``Run.created_by`` and ``Run.transform`` foreign keys.
    * Create the ``Category`` model, unique on ``(name, feature)``.
    * Add ``FeatureSet.schema`` and index ``FeatureSet.field``, ``schema``
      and ``type``.
    """

    dependencies = [
        ("lnschema_core", "0008_file_hash_type_transform_parents"),
    ]

    operations = [
        migrations.AddField(
            model_name="file",
            name="feature_sets",
            field=models.ManyToManyField(related_name="files", to="lnschema_core.featureset"),
        ),
        # Carry the existing file <-> feature set links over to the new through
        # table before the old relation is dropped below.
        #
        # The ``id`` column is deliberately NOT copied: on backends whose
        # auto-generated primary keys are backed by a sequence / identity
        # (e.g. PostgreSQL), inserting explicit ids does not advance that
        # sequence, so later inserts made through the ORM could collide with
        # the copied ids. Letting the database assign ids avoids this.
        #
        # ``reverse_sql`` copies the links back so the migration can be
        # unapplied; when reversing, the old through table has already been
        # re-created by the reversal of the ``RemoveField`` below.
        migrations.RunSQL(
            sql=(
                "insert into lnschema_core_file_feature_sets (file_id, featureset_id) "
                "select file_id, featureset_id from lnschema_core_featureset_files"
            ),
            reverse_sql=(
                "insert into lnschema_core_featureset_files (file_id, featureset_id) "
                "select file_id, featureset_id from lnschema_core_file_feature_sets"
            ),
        ),
        migrations.RemoveField(
            model_name="featureset",
            name="files",
        ),
        migrations.AddField(
            model_name="feature",
            name="unit",
            field=models.CharField(db_index=True, default=None, max_length=30, null=True),
        ),
        migrations.AlterField(
            model_name="feature",
            name="description",
            field=models.TextField(db_index=True, default=None, null=True),
        ),
        migrations.AlterField(
            model_name="feature",
            name="type",
            field=models.CharField(db_index=True, default=None, max_length=96, null=True),
        ),
        # NOTE(review): this intermediate state of ``featureset.field`` is
        # altered again further down to add ``db_index=True``; kept as-is so
        # the recorded operation sequence is unchanged.
        migrations.AlterField(
            model_name="featureset",
            name="field",
            field=models.CharField(max_length=64),
        ),
        migrations.AlterField(
            model_name="run",
            name="created_by",
            field=models.ForeignKey(
                default=lnschema_core.users.current_user_id, on_delete=django.db.models.deletion.CASCADE, related_name="created_runs", to="lnschema_core.user"
            ),
        ),
        migrations.AlterField(
            model_name="run",
            name="transform",
            field=models.ForeignKey(on_delete=django.db.models.deletion.CASCADE, related_name="runs", to="lnschema_core.transform"),
        ),
        migrations.CreateModel(
            name="Category",
            fields=[
                ("id", models.CharField(default=lnschema_core.ids.base62_12, max_length=12, primary_key=True, serialize=False)),
                ("name", models.CharField(db_index=True, max_length=128)),
                ("created_at", models.DateTimeField(auto_now_add=True, db_index=True)),
                ("updated_at", models.DateTimeField(auto_now=True, db_index=True)),
                (
                    "created_by",
                    models.ForeignKey(
                        default=lnschema_core.users.current_user_id, on_delete=django.db.models.deletion.PROTECT, related_name="created_categories", to="lnschema_core.user"
                    ),
                ),
                ("feature", models.ForeignKey(on_delete=django.db.models.deletion.CASCADE, related_name="categories", to="lnschema_core.feature")),
            ],
            options={
                "unique_together": {("name", "feature")},
            },
        ),
        # ``default="core"`` with ``preserve_default=False`` is a one-off value
        # used only to populate existing rows; the field itself keeps no
        # default (see the ``AlterField`` on ``schema`` below).
        migrations.AddField(
            model_name="featureset",
            name="schema",
            field=models.CharField(default="core", max_length=64),
            preserve_default=False,
        ),
        migrations.AlterField(
            model_name="featureset",
            name="field",
            field=models.CharField(db_index=True, max_length=64),
        ),
        migrations.AlterField(
            model_name="featureset",
            name="schema",
            field=models.CharField(db_index=True, max_length=64),
        ),
        migrations.AlterField(
            model_name="featureset",
            name="type",
            field=models.CharField(db_index=True, max_length=64),
        ),
    ]
67 changes: 43 additions & 24 deletions lnschema_core/models.py
Original file line number Diff line number Diff line change
Expand Up @@ -835,14 +835,11 @@ class Feature(ORM):
"""Universal id, valid across DB instances."""
name = models.CharField(max_length=255, db_index=True, default=None)
"""Name of feature (required)."""
type = models.CharField(max_length=96, null=True, default=None)
type = models.CharField(max_length=96, db_index=True, null=True, default=None)
"""Type. If an ORM, is formatted as ``"{schema_name}{ORM.__name__}"``."""
unit = models.CharField(max_length=30, null=True, default=None)
unit = models.CharField(max_length=30, db_index=True, null=True, default=None)
"""Unit of measure, ideally SI, e.g., `m`, `s`, `kg`, etc."""
# values through FeatureValue
field = models.CharField(max_length=64, null=True, default=None)
"""If type is an ORM, the corresponding field."""
description = models.TextField(null=True, default=None)
description = models.TextField(db_index=True, null=True, default=None)
"""A description."""
synonyms = models.TextField(null=True, default=None)
"""Bar-separated (|) synonyms."""
Expand Down Expand Up @@ -895,15 +892,25 @@ def save(self, *args, **kwargs) -> None:
class FeatureSet(ORM):
"""Jointly measured sets of features.

A `feature_set` is represented by the hash of the id set for the feature type.
.. note::

A `FeatureSet` is a useful entity as you might have millions of data batches
that measure the same features: All of them would link against a single
feature set. If instead, you'd link against single features (say, genes),
you'd face exploding link tables.

Guides:
A `feature_set` is identified by the hash of the id set for the feature type.

- :doc:`/biology/scrna`
- :doc:`/biology/flow`
Notes:

- :doc:`/biology/scrna`
- :doc:`/biology/flow`

Examples:

>>> df = pd.DataFrame({"feat1": [1, 2], "feat2": [3.1, 4.2], "feat3": ["cond1", "cond2"]})
>>> feature_set = ln.FeatureSet.from_df(df)

>>> features = ln.Feature.from_values(["feat1", "feat2"])
>>> ln.FeatureSet(features)

Expand All @@ -918,9 +925,11 @@ class FeatureSet(ORM):

id = models.CharField(max_length=20, primary_key=True, default=None)
"""A universal id (hash of the set of feature identifiers)."""
type = models.CharField(max_length=64)
"""Type formatted as ``"{schema_name}{ORM.__name__}"``."""
field = models.CharField(max_length=64)
type = models.CharField(max_length=64, db_index=True)
"""Type, the ORM name."""
schema = models.CharField(max_length=64, db_index=True)
"""The schema where the ORM is defined."""
field = models.CharField(max_length=64, db_index=True)
"""Field of ORM that was hashed."""
created_at = models.DateTimeField(auto_now_add=True, db_index=True)
"""Time of creation of record."""
Expand Down Expand Up @@ -982,21 +991,31 @@ def save(self, *args, **kwargs) -> None:
"""Save."""


class FeatureValue(ORM):
"""Categorical values of features.
class Category(ORM):
"""Categories of categorical features.

Stores values for feature types that don't have a dedicated ORM.
This is the default registry for tracking categories of categorical features.

Is analogous to, say, the `Gene` ORM in `lnschema_bionty`.
If you're working a lot with different cell lines, proteins, genes, or other
entities of complexity, consider using the pre-defined biological registries
in :mod:`lnschema_bionty`.
"""

feature = models.ForeignKey(Feature, CASCADE, related_name="values")
id = models.CharField(max_length=12, default=base62_12, primary_key=True)
"""Universal id, valid across DB instances."""
name = models.CharField(max_length=128, db_index=True)
"""The name or the string value of the category."""
feature = models.ForeignKey(Feature, CASCADE, related_name="categories")
"""Feature."""
value = models.CharField(max_length=128)
"""Value."""
created_at = models.DateTimeField(auto_now_add=True, db_index=True)
"""Time of creation of record."""
updated_at = models.DateTimeField(auto_now=True, db_index=True)
"""Time of last update to record."""
created_by = models.ForeignKey(User, PROTECT, default=current_user_id, related_name="created_categories")
"""Creator of record, a :class:`~lamindb.User`."""

class Meta:
unique_together = (("feature", "value"),)
unique_together = (("name", "feature"),)


class File(ORM):
Expand All @@ -1016,8 +1035,8 @@ class File(ORM):

Track where files come from by passing the generating :class:`~lamindb.Run`.

Often, files store jointly measured observations of features: track them
with :class:`~lamindb.FeatureSet`.
Often, files store jointly measured features: track them with
:class:`~lamindb.FeatureSet`.

If files have corresponding representations in storage and memory, LaminDB
makes some configurable default choices (e.g., serialize a `DataFrame` as a
Expand Down Expand Up @@ -1184,7 +1203,7 @@ def from_df(
>>> file
File(id=kV3JQuBw4izvUdAkjO4p, suffix=.parquet, description=Iris flower dataset batch1, size=5334, hash=RraiKH9BAtAgS5jg7LWUiA, hash_type=md5, storage_id=Zl2q0vQB, created_by_id=DzTjkKse) # noqa
>>> file.save()
💬 Created 2 FeatureValue records with a single field value
💬 Created 2 Category records with a single field value
💡 storing file kV3JQuBw4izvUdAkjO4p with key .lamindb/kV3JQuBw4izvUdAkjO4p.parquet
"""
pass
Expand Down