Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Allow ULID for MPIDS #928

Merged
merged 5 commits into from
Jan 23, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 2 additions & 2 deletions emmet-builders/emmet/builders/materials/electrodes.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,7 @@
from pymatgen.analysis.phase_diagram import PhaseDiagram, Composition

from emmet.core.electrode import InsertionElectrodeDoc, ConversionElectrodeDoc
from emmet.core.structure_group import StructureGroupDoc, _get_id_num
from emmet.core.structure_group import StructureGroupDoc, _get_id_lexi
from emmet.core.utils import jsanitize
from emmet.builders.settings import EmmetBuildSettings

Expand Down Expand Up @@ -571,7 +571,7 @@ def process_item(self, item) -> Dict:
for e in pd.entries
]
material_ids = list(filter(None, material_ids))
lowest_id = min(material_ids, key=_get_id_num)
lowest_id = min(material_ids, key=_get_id_lexi)
conversion_electrode_doc = (
ConversionElectrodeDoc.from_composition_and_pd(
comp=v[1],
Expand Down
26 changes: 22 additions & 4 deletions emmet-core/emmet/core/mpid.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
# %%
import re
from typing import Union, Any, Callable

Expand All @@ -7,10 +8,14 @@
from pydantic.json_schema import JsonSchemaValue


# matches "mp-1234" or "1234" followed by and optional "-(Alphanumeric)"
mpid_regex = re.compile(r"^([A-Za-z]*-)?(\d+)(-[A-Za-z0-9]+)*$")
mpculeid_regex = re.compile(
r"^([A-Za-z]+-)?([A-Fa-f0-9]+)-([A-Za-z0-9]+)-(m?[0-9]+)-([0-9]+)$"
)
# matches capital letters and numbers of length 26 (ULID)
# followed by and optional "-(Alphanumeric)"
check_ulid = re.compile(r"^[A-Z0-9]{26}(-[A-Za-z0-9]+)*$")


class MPID(str):
Expand Down Expand Up @@ -39,14 +44,22 @@ def __init__(self, val: Union["MPID", int, str]):
self.string = str(val)

elif isinstance(val, str):
parts = val.split("-")
parts[1] = int(parts[1]) # type: ignore
self.parts = tuple(parts)
if mpid_regex.fullmatch(val):
parts = val.split("-")
parts[1] = int(parts[1]) # type: ignore
self.parts = tuple(parts)
elif check_ulid.fullmatch(val):
ulid = val.split("-")[0]
self.parts = (ulid, 0)
else:
raise ValueError(
"MPID string representation must follow the format prefix-number or start with a valid ULID."
)
self.string = val

else:
raise ValueError(
"Must provide an MPID, int, or string of the format prefix-number"
"Must provide an MPID, int, or string of the format prefix-number or start with a valid ULID."
)

def __eq__(self, other: object):
Expand Down Expand Up @@ -106,12 +119,17 @@ def validate(cls, __input_value: Any, _: core_schema.ValidationInfo):
return __input_value
elif isinstance(__input_value, str) and mpid_regex.fullmatch(__input_value):
return MPID(__input_value)
elif isinstance(__input_value, str) and check_ulid.fullmatch(__input_value):
return MPID(__input_value)
elif isinstance(__input_value, int):
return MPID(__input_value)

raise ValueError("Invalid MPID Format")


# %%


class MPculeID(str):
"""
A Materials Project Molecule ID with a prefix, hash, and two integer values
Expand Down
15 changes: 7 additions & 8 deletions emmet-core/emmet/core/structure_group.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@
from pymatgen.core.composition import Composition
from pymatgen.entries.computed_entries import ComputedEntry, ComputedStructureEntry
from pymatgen.symmetry.analyzer import SpacegroupAnalyzer
from emmet.core.mpid import MPID

logger = logging.getLogger(__name__)

Expand Down Expand Up @@ -127,7 +128,7 @@ def from_grouped_entries(
framework_comp = Composition.from_dict(comp_d)
framework_str = framework_comp.reduced_formula
ids = [ient.data["material_id"] for ient in entries]
lowest_id = min(ids, key=_get_id_num)
lowest_id = min(ids, key=_get_id_lexi)
sub_script = "_".join([ignored_specie])
host_and_insertion_ids = StructureGroupDoc.get_host_and_insertion_ids(
entries=entries, ignored_specie=ignored_specie
Expand Down Expand Up @@ -278,13 +279,11 @@ def get_num_sym_ops(ent):
yield [el for el in sub_g]


def _get_id_num(task_id) -> Union[int, str]:
if isinstance(task_id, int):
return task_id
if isinstance(task_id, str):
return int(task_id.split("-")[-1])
else:
raise ValueError("TaskID needs to be either a number or of the form xxx-#####")
def _get_id_lexi(task_id) -> Union[int, str]:
"""Get a lexicographic representation for a task ID"""
# matches "mp-1234" or "1234" followed by and optional "-(Alphanumeric)"
mpid = MPID(task_id)
return mpid.parts


def _get_framework(formula, ignored_specie) -> str:
Expand Down
5 changes: 5 additions & 0 deletions emmet-core/tests/test_mpid.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
from emmet.core.mpid import MPID, MPculeID
import pytest


def test_mpid():
Expand All @@ -24,6 +25,10 @@ def test_mpid():
)

MPID(3)
ulid_mpid = MPID("01HMVV88CCQ6JQ2Y1N8F3ZTVWP-Li")
assert ulid_mpid.parts == ("01HMVV88CCQ6JQ2Y1N8F3ZTVWP", 0)
with pytest.raises(ValueError, match="MPID string representation must follow"):
MPID("GGIRADF")


def test_mpculeid():
Expand Down
11 changes: 10 additions & 1 deletion emmet-core/tests/test_structure_group.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@
from monty.serialization import loadfn
from pymatgen.core import Composition

from emmet.core.structure_group import StructureGroupDoc
from emmet.core.structure_group import StructureGroupDoc, _get_id_lexi


@pytest.fixture(scope="session")
Expand Down Expand Up @@ -55,3 +55,12 @@ def test_StructureGroupDoc_from_ungrouped_entries(entries_lfeo):
dd_.pop(ignored)
framework = Composition.from_dict(dd_).reduced_formula
assert framework == framework_ref


def test_lexi_id():
assert _get_id_lexi("01HMVV88CCQ6JQ2Y1N8F3ZTVWP") == (
"01HMVV88CCQ6JQ2Y1N8F3ZTVWP",
0,
)
assert _get_id_lexi("mp-123") == ("mp", 123)
assert _get_id_lexi("123") == ("", 123)