Skip to content

Commit

Permalink
Merge pull request #104 from atlanhq/AM-118
Browse files Browse the repository at this point in the history
Search simplifications and optimizations
  • Loading branch information
ErnestoLoma authored Aug 23, 2023
2 parents 78038f8 + c47a3af commit d7c28de
Show file tree
Hide file tree
Showing 97 changed files with 23,143 additions and 13,502 deletions.
4 changes: 4 additions & 0 deletions HISTORY.md
Original file line number Diff line number Diff line change
@@ -1,3 +1,7 @@
## 0.6.1 (August 23, 2023)
* Adds a new FluentSearch approach to searching (no need to know Elastic anymore)
* Changes default inclusion operation for Bool queries to filter rather than must (for efficiency)

## 0.6.0 (August 17, 2023)
* Added:
* Adds new purpose policy permissions (attaching / detaching terms from assets)
Expand Down
48 changes: 48 additions & 0 deletions pyatlan/cache/custom_metadata_cache.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@ class CustomMetadataCache:
"""

cache_by_id: dict[str, CustomMetadataDef] = dict()
attr_cache_by_id: dict[str, AttributeDef] = dict()
map_id_to_name: dict[str, str] = dict()
map_name_to_id: dict[str, str] = dict()
map_attr_id_to_name: dict[str, dict[str, str]] = dict()
Expand All @@ -40,6 +41,7 @@ def refresh_cache(cls) -> None:
cls.map_attr_name_to_id = {}
cls.archived_attr_ids = {}
cls.cache_by_id = {}
cls.attr_cache_by_id = {}
for cm in response.custom_metadata_defs:
type_id = cm.name
type_name = cm.display_name
Expand All @@ -53,6 +55,7 @@ def refresh_cache(cls) -> None:
attr_id = str(attr.name)
attr_name = str(attr.display_name)
cls.map_attr_id_to_name[type_id][attr_id] = attr_name
cls.attr_cache_by_id[attr_id] = attr
if attr.options and attr.options.is_archived:
cls.archived_attr_ids[attr_id] = attr_name
elif attr_name in cls.map_attr_name_to_id[type_id]:
Expand Down Expand Up @@ -213,6 +216,14 @@ def _get_attributes_for_search_results(cls, set_id: str) -> Optional[list[str]]:
return [f"{set_id}.{idstr}" for idstr in attr_ids]
return None

@classmethod
def _get_attribute_for_search_results(
    cls, set_id: str, attr_name: str
) -> Optional[str]:
    """
    Build the name of a single custom metadata attribute to include on search results.
    The result is qualified by the custom metadata set, in the same
    ``<set_id>.<attr_id>`` form that _get_attributes_for_search_results produces.

    :param set_id: Atlan-internal ID string of the custom metadata set
    :param attr_name: human-readable name of the attribute within that set
    :returns: the set-qualified attribute ID, or None if the set or the attribute is not cached
    """
    attrs_by_name = cls.map_attr_name_to_id.get(set_id) or {}
    attr_id = attrs_by_name.get(attr_name)
    if not attr_id:
        # Return None (not "<set_id>.None") so callers can detect the miss and refresh.
        return None
    return f"{set_id}.{attr_id}"

@classmethod
def get_attributes_for_search_results(cls, set_name: str) -> Optional[list[str]]:
"""
Expand All @@ -228,6 +239,25 @@ def get_attributes_for_search_results(cls, set_name: str) -> Optional[list[str]]
return cls._get_attributes_for_search_results(set_id)
return None

@classmethod
def get_attribute_for_search_results(
    cls, set_name: str, attr_name: str
) -> Optional[str]:
    """
    Look up the name of one custom metadata attribute for inclusion on search results.
    If the attribute is not in the cache, the cache is refreshed once and the lookup retried.

    :param set_name: human-readable name of the custom metadata set that owns the attribute
    :param attr_name: human-readable name of the attribute
    :returns: the attribute name, strictly useful for inclusion in search results,
        or None when the set or the attribute cannot be resolved
    """
    set_id = cls.get_id_for_name(set_name)
    if not set_id:
        return None
    cached = cls._get_attribute_for_search_results(set_id, attr_name)
    if cached:
        return cached
    # Miss: the attribute may have been created since the cache was last loaded.
    cls.refresh_cache()
    return cls._get_attribute_for_search_results(set_id, attr_name)

@classmethod
def get_custom_metadata_def(cls, name: str) -> CustomMetadataDef:
"""
Expand All @@ -243,3 +273,21 @@ def get_custom_metadata_def(cls, name: str) -> CustomMetadataDef:
return typedef
else:
raise ValueError(f"No custom metadata with the name: {name} found")

@classmethod
def get_attribute_def(cls, attr_id: str) -> AttributeDef:
    """
    Retrieve a specific custom metadata attribute definition by its unique Atlan-internal ID string.

    :param attr_id: Atlan-internal ID string for the custom metadata attribute
    :returns: attribute definition for the custom metadata attribute
    :raises ValueError: if no ID is provided, or no attribute with that ID is cached
    """
    if not attr_id:
        raise ValueError(
            "No custom metadata attribute ID was provided, cannot lookup attribute definition."
        )
    # The cache starts out as an empty dict (never None), so test for emptiness
    # to decide whether it still needs to be populated.
    if not cls.attr_cache_by_id:
        cls.refresh_cache()
    if attr_def := cls.attr_cache_by_id.get(attr_id):
        return attr_def
    raise ValueError(f"No custom metadata attribute with the id: {attr_id} found")
8 changes: 4 additions & 4 deletions pyatlan/client/atlan.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,7 @@
import time
import uuid
from abc import ABC
from typing import ClassVar, Generator, Optional, Type, TypeVar, Union
from typing import ClassVar, Generator, Iterable, Optional, Type, TypeVar, Union

import requests
from pydantic import (
Expand Down Expand Up @@ -250,7 +250,7 @@ class AtlanClient(BaseSettings):
class Config:
env_prefix = "atlan_"

class SearchResults(ABC):
class SearchResults(ABC, Iterable):
"""
Abstract class that encapsulates results returned by various searches.
"""
Expand Down Expand Up @@ -335,7 +335,7 @@ def __iter__(self) -> Generator[Asset, None, None]:
if not self.next_page():
break

class IndexSearchResults(SearchResults):
class IndexSearchResults(SearchResults, Iterable):
"""
Captures the response from a search against Atlan. Also provides the ability to
iteratively page through results, without needing to track or re-run the original
Expand Down Expand Up @@ -375,7 +375,7 @@ def _get_next_page(self):
def count(self) -> int:
return self._count

class LineageListResults(SearchResults):
class LineageListResults(SearchResults, Iterable):
"""
Captures the response from a lineage retrieval against Atlan. Also provides the ability to
iteratively page through results, without needing to track or re-run the original query.
Expand Down
134 changes: 133 additions & 1 deletion pyatlan/generator/class_generator.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,12 +6,13 @@
an Atlan instance. The script create_typedefs_file.py can be used to produce this file.
"""
import datetime
import enum
import json
import os
import re
from enum import Enum
from pathlib import Path
from typing import NamedTuple, Optional
from typing import Any, NamedTuple, Optional

import networkx as nx
from jinja2 import Environment, PackageLoader
Expand Down Expand Up @@ -344,13 +345,144 @@ def create_modules(cls):
asset_info.module_info.add_asset_info(asset_info=asset_info)


def get_class_var_for_attr(attr_name: str) -> str:
    """
    Convert an attribute name into an UPPER_SNAKE_CASE class-variable name.

    Underscores already in the name are dropped first; word boundaries are then
    marked with a single underscore and the result is upper-cased.

    :param attr_name: attribute name, e.g. ``qualifiedName``
    :returns: upper-snake-case form of the name, e.g. ``QUALIFIED_NAME``
    """
    without_underscores = attr_name.replace("_", "")
    # Separate a run of capitals from the capitalised word that follows it
    # (HTTPServer -> HTTP_Server).
    acronyms_split = re.sub(r"([A-Z]+)([A-Z][a-z])", r"\1_\2", without_underscores)
    # Separate a lower-case letter from an upper-case letter directly after it.
    words_split = re.sub(r"([a-z])([A-Z])", r"\1_\2", acronyms_split)
    return words_split.upper()


class IndexType(enum.Enum):
    """Kinds of search index under which an attribute (or one of its sub-fields) can be found."""

    # Member order is significant: enum.auto() numbers the members 1..7 in this order.
    KEYWORD = enum.auto()
    TEXT = enum.auto()
    RANK_FEATURE = enum.auto()
    BOOLEAN = enum.auto()
    NUMERIC = enum.auto()
    STEMMED = enum.auto()
    RELATION = enum.auto()


class SearchType:
    """
    Describes the searchable-field class to generate for an attribute.

    ``name`` is the name of the field class; ``args`` is the optional, already
    formatted text of extra constructor arguments (None when there are none).
    """

    name: str
    args: Optional[str]

    def __init__(self, name: str, args: Optional[str] = None):
        self.args = args
        self.name = name


def get_search_type(attr_def: dict[str, Any]) -> SearchType:
    """
    Work out which searchable-field class should be generated for an attribute.

    The attribute definition is inspected for the indexes it is searchable
    through, and the combination of indexes found selects the field class.

    :param attr_def: attribute definition from the typedefs, as a dict; the keys read are
        ``name``, ``typeName``, ``relationshipTypeName``, ``indexTypeESConfig`` and ``indexTypeESFields``
    :returns: the field class name plus the quoted, comma-separated index field names to pass to it;
        ``RelationField`` (without args) when the combination of indexes is not otherwise recognised
    """

    def default_index_for(base_type: str) -> IndexType:
        # Index implied by the primitive type alone.
        if base_type == "boolean":
            return IndexType.BOOLEAN
        if base_type in {"date", "float", "double", "int", "long"}:
            return IndexType.NUMERIC
        return IndexType.KEYWORD

    def text_index_for(field_name: str) -> IndexType:
        # Analysed text is either stemmed or plain, decided by the field-name suffix.
        if field_name.endswith(".stemmed"):
            return IndexType.STEMMED
        return IndexType.TEXT

    def embedded_type(attr_type: str) -> str:
        # Text between the first "<" and the first ">".
        start = attr_type.index("<") + 1
        end = attr_type.index(">")
        return attr_type[start:end]

    def resolve_base_type() -> str:
        type_name = str(attr_def.get("typeName"))
        if type_name.startswith("array<map<"):
            # Look inside the map, not the array wrapper.
            return embedded_type(type_name[len("array<") :])  # noqa: E203
        if type_name.startswith(("array<", "map<")):
            return embedded_type(type_name)
        return type_name

    def collect_indexes() -> dict[IndexType, str]:
        attr_name = str(attr_def.get("name"))
        if "relationshipTypeName" in attr_def:
            return {IndexType.RELATION: attr_name}

        found: dict[IndexType, str] = {}
        base_type = resolve_base_type()

        # Default index
        es_config = attr_def.get("indexTypeESConfig")
        if es_config:
            if es_config.get("analyzer") == "atlan_text_analyzer":
                found[text_index_for(attr_name)] = attr_name
        else:
            # NOTE(review): this branch is taken to pair with the es_config check
            # (no config at all) — confirm the nesting against the repository.
            found[default_index_for(base_type)] = attr_name

        # Additional indexes
        es_fields = attr_def.get("indexTypeESFields")
        if es_fields:
            for suffix in es_fields:
                field_name = f"{attr_name}.{suffix}"
                index_type = es_fields.get(suffix).get("type")
                if not index_type:
                    # NOTE(review): taken to be the fallback for a sub-field that
                    # declares no type — confirm the nesting against the repository.
                    found[default_index_for(base_type)] = field_name
                elif index_type == "keyword":
                    found[IndexType.KEYWORD] = field_name
                elif index_type == "text":
                    found[text_index_for(field_name)] = field_name
                elif index_type == "rank_feature":
                    found[IndexType.RANK_FEATURE] = field_name
        return found

    # Each recognised combination of indexes maps to a field class and the
    # order in which the index field names are passed to it.
    field_classes = {
        frozenset({IndexType.KEYWORD}): ("KeywordField", (IndexType.KEYWORD,)),
        frozenset({IndexType.TEXT}): ("TextField", (IndexType.TEXT,)),
        frozenset({IndexType.NUMERIC}): ("NumericField", (IndexType.NUMERIC,)),
        frozenset({IndexType.BOOLEAN}): ("BooleanField", (IndexType.BOOLEAN,)),
        frozenset({IndexType.NUMERIC, IndexType.RANK_FEATURE}): (
            "NumericRankField",
            (IndexType.NUMERIC, IndexType.RANK_FEATURE),
        ),
        frozenset({IndexType.KEYWORD, IndexType.TEXT}): (
            "KeywordTextField",
            (IndexType.KEYWORD, IndexType.TEXT),
        ),
        frozenset({IndexType.KEYWORD, IndexType.TEXT, IndexType.STEMMED}): (
            "KeywordTextStemmedField",
            (IndexType.KEYWORD, IndexType.TEXT, IndexType.STEMMED),
        ),
    }

    search_map = collect_indexes()
    selected = field_classes.get(frozenset(search_map))
    if selected is None:
        return SearchType(name="RelationField")
    class_name, arg_order = selected
    quoted_fields = ", ".join(f'"{search_map.get(index)}"' for index in arg_order)
    return SearchType(name=class_name, args=quoted_fields)


class Generator:
def __init__(self) -> None:
self.environment = Environment(
loader=PackageLoader("pyatlan.generator", "templates")
)
self.environment.filters["to_snake_case"] = to_snake_case
self.environment.filters["get_type"] = get_type
self.environment.filters["get_search_type"] = get_search_type
self.environment.filters["get_class_var_for_attr"] = get_class_var_for_attr

def merge_attributes(self, entity_def):
def merge_them(s, a):
Expand Down
10 changes: 10 additions & 0 deletions pyatlan/generator/templates/imports.jinja2
Original file line number Diff line number Diff line change
Expand Up @@ -50,6 +50,16 @@ from pyatlan.model.enums import (
SchemaRegistrySchemaType,
SourceCostUnitType,
)
from pyatlan.model.fields.atlan_fields import (
BooleanField,
KeywordField,
KeywordTextField,
KeywordTextStemmedField,
NumericField,
NumericRankField,
RelationField,
TextField,
)
from pyatlan.model.internal import AtlasServer, Internal
from pyatlan.model.structs import (
AuthPolicyCondition,
Expand Down
23 changes: 22 additions & 1 deletion pyatlan/generator/templates/macros.jinja2
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
{%- macro gen_properties(attribute_defs) %}
_convience_properties: ClassVar[list[str]] = [
_convenience_properties: ClassVar[list[str]] = [
{%- for attribute_def in attribute_defs %}
"{{ 'assigned_terms' if attribute_def.name == 'meanings' else attribute_def.name | to_snake_case }}",
{%- endfor %}]
Expand All @@ -21,3 +21,24 @@

{%- endfor %}
{% endmacro %}

{%- macro gen_property_class_vars(type_name, attribute_defs) %}
{#- Emits one searchable-field ClassVar per attribute in attribute_defs, skipping the attributes named "inputs" and "outputs". The field class and its extra arguments come from the get_search_type filter, the variable name from get_class_var_for_attr, and the attribute description becomes the docstring ("TBC" when no description is defined). type_name is accepted but not referenced in this macro. -#}
{%- for attribute_def in attribute_defs %}{% if attribute_def.name != "inputs" and attribute_def.name != "outputs" %}
{%- set search_type = attribute_def | get_search_type %}
{{ attribute_def.name | get_class_var_for_attr }}: ClassVar[{{ search_type.name }}] = {{ search_type.name }}("{{ attribute_def.name }}"{% if search_type.args %}, {{ search_type.args }}{% endif %})
"""
{{ attribute_def.description | default("TBC") }}
"""

{%- endif %}{%- endfor %}
{% endmacro %}

{%- macro gen_property_relationship_class_vars(type_name, relationship_attribute_defs) %}
{#- Emits one RelationField ClassVar per relationship attribute, named via get_class_var_for_attr and documented with the attribute description ("TBC" when none is defined). Skipped combinations: categories and anchor on AtlasGlossaryTerm, parentCategory and anchor on AtlasGlossaryCategory, meanings on Asset. NOTE(review): presumably skipped because those fields are declared by hand elsewhere; confirm. The anchor exclusion applies only to the two glossary types, so any other type with an anchor relationship still gets its field. -#}
{%- for attribute_def in relationship_attribute_defs %}{% if (type_name != "AtlasGlossaryTerm" or attribute_def.name != "categories") and (type_name != "AtlasGlossaryCategory" or attribute_def.name != "parentCategory") and (type_name not in ["AtlasGlossaryTerm", "AtlasGlossaryCategory"] or attribute_def.name != "anchor") and (type_name != "Asset" or attribute_def.name != "meanings") %}
{{ attribute_def.name | get_class_var_for_attr }}: ClassVar[RelationField] = RelationField("{{ attribute_def.name }}")
"""
{{ attribute_def.description | default("TBC") }}
"""

{%- endif %}{%- endfor %}
{% endmacro %}
Original file line number Diff line number Diff line change
Expand Up @@ -48,3 +48,9 @@
qualified_name=qualified_name, name=name, anchor=glossary
)
)

# Hand-written search fields for the glossary category: the first argument is the
# attribute name, the second the indexed field that holds the related qualifiedName.
ANCHOR: ClassVar[KeywordField] = KeywordField("anchor", "__glossary")
"""Glossary in which the category is contained, searchable by the qualifiedName of the glossary."""

PARENT_CATEGORY: ClassVar[KeywordField] = KeywordField("parentCategory", "__parentCategory")
"""Parent category in which a subcategory is contained, searchable by the qualifiedName of the category."""
Original file line number Diff line number Diff line change
Expand Up @@ -49,3 +49,9 @@
qualified_name=qualified_name, name=name, anchor=glossary
)
)

# Hand-written search fields for the glossary term. NOTE(review): the first argument
# appears to be the attribute name and the second the indexed field searched by
# qualifiedName — confirm against KeywordField's signature.
ANCHOR: ClassVar[KeywordField] = KeywordField("anchor", "__glossary")
"""Glossary in which the term is contained, searchable by the qualifiedName of the glossary."""

CATEGORIES: ClassVar[KeywordField] = KeywordField("categories", "__categories")
"""Categories in which the term is organized, searchable by the qualifiedName of the category."""
Loading

0 comments on commit d7c28de

Please sign in to comment.