Skip to content

Commit

Permalink
Allow ULID for MPIDS (#928)
Browse files Browse the repository at this point in the history
* ulid changes

* ulid changes

* ulid tests

* ulid tests

* ulid tests
  • Loading branch information
jmmshn committed Jan 23, 2024
1 parent 4372799 commit d7c8036
Show file tree
Hide file tree
Showing 5 changed files with 46 additions and 15 deletions.
4 changes: 2 additions & 2 deletions emmet-builders/emmet/builders/materials/electrodes.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,7 @@
from pymatgen.analysis.phase_diagram import PhaseDiagram, Composition

from emmet.core.electrode import InsertionElectrodeDoc, ConversionElectrodeDoc
from emmet.core.structure_group import StructureGroupDoc, _get_id_num
from emmet.core.structure_group import StructureGroupDoc, _get_id_lexi
from emmet.core.utils import jsanitize
from emmet.builders.settings import EmmetBuildSettings

Expand Down Expand Up @@ -571,7 +571,7 @@ def process_item(self, item) -> Dict:
for e in pd.entries
]
material_ids = list(filter(None, material_ids))
lowest_id = min(material_ids, key=_get_id_num)
lowest_id = min(material_ids, key=_get_id_lexi)
conversion_electrode_doc = (
ConversionElectrodeDoc.from_composition_and_pd(
comp=v[1],
Expand Down
26 changes: 22 additions & 4 deletions emmet-core/emmet/core/mpid.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
# %%
import re
from typing import Union, Any, Callable

Expand All @@ -7,10 +8,14 @@
from pydantic.json_schema import JsonSchemaValue


# matches "mp-1234" or "1234" followed by and optional "-(Alphanumeric)"
mpid_regex = re.compile(r"^([A-Za-z]*-)?(\d+)(-[A-Za-z0-9]+)*$")
mpculeid_regex = re.compile(
r"^([A-Za-z]+-)?([A-Fa-f0-9]+)-([A-Za-z0-9]+)-(m?[0-9]+)-([0-9]+)$"
)
# matches capital letters and numbers of length 26 (ULID)
# followed by and optional "-(Alphanumeric)"
check_ulid = re.compile(r"^[A-Z0-9]{26}(-[A-Za-z0-9]+)*$")


class MPID(str):
Expand Down Expand Up @@ -39,14 +44,22 @@ def __init__(self, val: Union["MPID", int, str]):
self.string = str(val)

elif isinstance(val, str):
parts = val.split("-")
parts[1] = int(parts[1]) # type: ignore
self.parts = tuple(parts)
if mpid_regex.fullmatch(val):
parts = val.split("-")
parts[1] = int(parts[1]) # type: ignore
self.parts = tuple(parts)
elif check_ulid.fullmatch(val):
ulid = val.split("-")[0]
self.parts = (ulid, 0)
else:
raise ValueError(
"MPID string representation must follow the format prefix-number or start with a valid ULID."
)
self.string = val

else:
raise ValueError(
"Must provide an MPID, int, or string of the format prefix-number"
"Must provide an MPID, int, or string of the format prefix-number or start with a valid ULID."
)

def __eq__(self, other: object):
Expand Down Expand Up @@ -106,12 +119,17 @@ def validate(cls, __input_value: Any, _: core_schema.ValidationInfo):
return __input_value
elif isinstance(__input_value, str) and mpid_regex.fullmatch(__input_value):
return MPID(__input_value)
elif isinstance(__input_value, str) and check_ulid.fullmatch(__input_value):
return MPID(__input_value)
elif isinstance(__input_value, int):
return MPID(__input_value)

raise ValueError("Invalid MPID Format")


# %%


class MPculeID(str):
"""
A Materials Project Molecule ID with a prefix, hash, and two integer values
Expand Down
15 changes: 7 additions & 8 deletions emmet-core/emmet/core/structure_group.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@
from pymatgen.core.composition import Composition
from pymatgen.entries.computed_entries import ComputedEntry, ComputedStructureEntry
from pymatgen.symmetry.analyzer import SpacegroupAnalyzer
from emmet.core.mpid import MPID

logger = logging.getLogger(__name__)

Expand Down Expand Up @@ -127,7 +128,7 @@ def from_grouped_entries(
framework_comp = Composition.from_dict(comp_d)
framework_str = framework_comp.reduced_formula
ids = [ient.data["material_id"] for ient in entries]
lowest_id = min(ids, key=_get_id_num)
lowest_id = min(ids, key=_get_id_lexi)
sub_script = "_".join([ignored_specie])
host_and_insertion_ids = StructureGroupDoc.get_host_and_insertion_ids(
entries=entries, ignored_specie=ignored_specie
Expand Down Expand Up @@ -278,13 +279,11 @@ def get_num_sym_ops(ent):
yield [el for el in sub_g]


def _get_id_num(task_id) -> Union[int, str]:
if isinstance(task_id, int):
return task_id
if isinstance(task_id, str):
return int(task_id.split("-")[-1])
else:
raise ValueError("TaskID needs to be either a number or of the form xxx-#####")
def _get_id_lexi(task_id) -> Union[int, str]:
"""Get a lexicographic representation for a task ID"""
# matches "mp-1234" or "1234" followed by and optional "-(Alphanumeric)"
mpid = MPID(task_id)
return mpid.parts


def _get_framework(formula, ignored_specie) -> str:
Expand Down
5 changes: 5 additions & 0 deletions emmet-core/tests/test_mpid.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
from emmet.core.mpid import MPID, MPculeID
import pytest


def test_mpid():
Expand All @@ -24,6 +25,10 @@ def test_mpid():
)

MPID(3)
ulid_mpid = MPID("01HMVV88CCQ6JQ2Y1N8F3ZTVWP-Li")
assert ulid_mpid.parts == ("01HMVV88CCQ6JQ2Y1N8F3ZTVWP", 0)
with pytest.raises(ValueError, match="MPID string representation must follow"):
MPID("GGIRADF")


def test_mpculeid():
Expand Down
11 changes: 10 additions & 1 deletion emmet-core/tests/test_structure_group.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@
from monty.serialization import loadfn
from pymatgen.core import Composition

from emmet.core.structure_group import StructureGroupDoc
from emmet.core.structure_group import StructureGroupDoc, _get_id_lexi


@pytest.fixture(scope="session")
Expand Down Expand Up @@ -55,3 +55,12 @@ def test_StructureGroupDoc_from_ungrouped_entries(entries_lfeo):
dd_.pop(ignored)
framework = Composition.from_dict(dd_).reduced_formula
assert framework == framework_ref


def test_lexi_id():
assert _get_id_lexi("01HMVV88CCQ6JQ2Y1N8F3ZTVWP") == (
"01HMVV88CCQ6JQ2Y1N8F3ZTVWP",
0,
)
assert _get_id_lexi("mp-123") == ("mp", 123)
assert _get_id_lexi("123") == ("", 123)

0 comments on commit d7c8036

Please sign in to comment.