diff --git a/emmet-builders/emmet/builders/materials/electrodes.py b/emmet-builders/emmet/builders/materials/electrodes.py index effa51bcb7..b00114ec63 100644 --- a/emmet-builders/emmet/builders/materials/electrodes.py +++ b/emmet-builders/emmet/builders/materials/electrodes.py @@ -14,7 +14,7 @@ from pymatgen.analysis.phase_diagram import PhaseDiagram, Composition from emmet.core.electrode import InsertionElectrodeDoc, ConversionElectrodeDoc -from emmet.core.structure_group import StructureGroupDoc, _get_id_num +from emmet.core.structure_group import StructureGroupDoc, _get_id_lexi from emmet.core.utils import jsanitize from emmet.builders.settings import EmmetBuildSettings @@ -571,7 +571,7 @@ def process_item(self, item) -> Dict: for e in pd.entries ] material_ids = list(filter(None, material_ids)) - lowest_id = min(material_ids, key=_get_id_num) + lowest_id = min(material_ids, key=_get_id_lexi) conversion_electrode_doc = ( ConversionElectrodeDoc.from_composition_and_pd( comp=v[1], diff --git a/emmet-core/emmet/core/mpid.py b/emmet-core/emmet/core/mpid.py index 079476b8e2..77d577ab23 100644 --- a/emmet-core/emmet/core/mpid.py +++ b/emmet-core/emmet/core/mpid.py @@ -1,3 +1,4 @@ +# %% import re from typing import Union, Any, Callable @@ -7,10 +8,14 @@ from pydantic.json_schema import JsonSchemaValue +# matches "mp-1234" or "1234" followed by and optional "-(Alphanumeric)" mpid_regex = re.compile(r"^([A-Za-z]*-)?(\d+)(-[A-Za-z0-9]+)*$") mpculeid_regex = re.compile( r"^([A-Za-z]+-)?([A-Fa-f0-9]+)-([A-Za-z0-9]+)-(m?[0-9]+)-([0-9]+)$" ) +# matches capital letters and numbers of length 26 (ULID) +# followed by and optional "-(Alphanumeric)" +check_ulid = re.compile(r"^[A-Z0-9]{26}(-[A-Za-z0-9]+)*$") class MPID(str): @@ -39,14 +44,22 @@ def __init__(self, val: Union["MPID", int, str]): self.string = str(val) elif isinstance(val, str): - parts = val.split("-") - parts[1] = int(parts[1]) # type: ignore - self.parts = tuple(parts) + if mpid_regex.fullmatch(val): + parts = val.split("-") + parts[1] = int(parts[1]) # type: ignore + self.parts = tuple(parts) + elif check_ulid.fullmatch(val): + ulid = val.split("-")[0] + self.parts = (ulid, 0) + else: + raise ValueError( + "MPID string representation must follow the format prefix-number or start with a valid ULID." + ) self.string = val else: raise ValueError( - "Must provide an MPID, int, or string of the format prefix-number" + "Must provide an MPID, int, or string of the format prefix-number or start with a valid ULID." ) def __eq__(self, other: object): @@ -106,12 +119,17 @@ def validate(cls, __input_value: Any, _: core_schema.ValidationInfo): return __input_value elif isinstance(__input_value, str) and mpid_regex.fullmatch(__input_value): return MPID(__input_value) + elif isinstance(__input_value, str) and check_ulid.fullmatch(__input_value): + return MPID(__input_value) elif isinstance(__input_value, int): return MPID(__input_value) raise ValueError("Invalid MPID Format") +# %% + + class MPculeID(str): """ A Materials Project Molecule ID with a prefix, hash, and two integer values diff --git a/emmet-core/emmet/core/structure_group.py b/emmet-core/emmet/core/structure_group.py index 1a8bae2f02..d124fd4e4f 100644 --- a/emmet-core/emmet/core/structure_group.py +++ b/emmet-core/emmet/core/structure_group.py @@ -10,6 +10,7 @@ from pymatgen.core.composition import Composition from pymatgen.entries.computed_entries import ComputedEntry, ComputedStructureEntry from pymatgen.symmetry.analyzer import SpacegroupAnalyzer +from emmet.core.mpid import MPID logger = logging.getLogger(__name__) @@ -127,7 +128,7 @@ def from_grouped_entries( framework_comp = Composition.from_dict(comp_d) framework_str = framework_comp.reduced_formula ids = [ient.data["material_id"] for ient in entries] - lowest_id = min(ids, key=_get_id_num) + lowest_id = min(ids, key=_get_id_lexi) sub_script = "_".join([ignored_specie]) host_and_insertion_ids = StructureGroupDoc.get_host_and_insertion_ids( entries=entries, ignored_specie=ignored_specie @@ -278,13 +279,11 @@ def get_num_sym_ops(ent): yield [el for el in sub_g] -def _get_id_num(task_id) -> Union[int, str]: - if isinstance(task_id, int): - return task_id - if isinstance(task_id, str): - return int(task_id.split("-")[-1]) - else: - raise ValueError("TaskID needs to be either a number or of the form xxx-#####") +def _get_id_lexi(task_id) -> Union[int, str]: + """Get a lexicographic representation for a task ID""" + # matches "mp-1234" or "1234" followed by and optional "-(Alphanumeric)" + mpid = MPID(task_id) + return mpid.parts def _get_framework(formula, ignored_specie) -> str: diff --git a/emmet-core/tests/test_mpid.py b/emmet-core/tests/test_mpid.py index bedc177ab4..465b5c1896 100644 --- a/emmet-core/tests/test_mpid.py +++ b/emmet-core/tests/test_mpid.py @@ -1,4 +1,5 @@ from emmet.core.mpid import MPID, MPculeID +import pytest def test_mpid(): @@ -24,6 +25,10 @@ def test_mpid(): ) MPID(3) + ulid_mpid = MPID("01HMVV88CCQ6JQ2Y1N8F3ZTVWP-Li") + assert ulid_mpid.parts == ("01HMVV88CCQ6JQ2Y1N8F3ZTVWP", 0) + with pytest.raises(ValueError, match="MPID string representation must follow"): + MPID("GGIRADF") def test_mpculeid(): diff --git a/emmet-core/tests/test_structure_group.py b/emmet-core/tests/test_structure_group.py index 51f0d91265..3753b9291c 100644 --- a/emmet-core/tests/test_structure_group.py +++ b/emmet-core/tests/test_structure_group.py @@ -2,7 +2,7 @@ from monty.serialization import loadfn from pymatgen.core import Composition -from emmet.core.structure_group import StructureGroupDoc +from emmet.core.structure_group import StructureGroupDoc, _get_id_lexi @pytest.fixture(scope="session") @@ -55,3 +55,12 @@ def test_StructureGroupDoc_from_ungrouped_entries(entries_lfeo): dd_.pop(ignored) framework = Composition.from_dict(dd_).reduced_formula assert framework == framework_ref + + +def test_lexi_id(): + assert _get_id_lexi("01HMVV88CCQ6JQ2Y1N8F3ZTVWP") == ( + "01HMVV88CCQ6JQ2Y1N8F3ZTVWP", + 0, + ) + assert _get_id_lexi("mp-123") == ("mp", 123) + assert _get_id_lexi("123") == ("", 123)