source and schema changes #769
```diff
@@ -134,7 +134,7 @@ def add_column_defaults(column: TColumnSchemaBase) -> TColumnSchema:
     # return copy(column) # type: ignore


-def bump_version_if_modified(stored_schema: TStoredSchema) -> Tuple[int, str]:
+def bump_version_if_modified(stored_schema: TStoredSchema) -> Tuple[int, str, List[str]]:
     # if any change to schema document is detected then bump version and write new hash
     hash_ = generate_version_hash(stored_schema)
     previous_hash = stored_schema.get("version_hash")
```
```diff
@@ -143,8 +143,13 @@ def bump_version_if_modified(stored_schema: TStoredSchema) -> Tuple[int, str]:
         pass
     elif hash_ != previous_hash:
         stored_schema["version"] += 1
+        # unshift previous hash to ancestors and limit array to 10 entries
+        if previous_hash not in stored_schema["ancestors"]:
+            stored_schema["ancestors"].insert(0, previous_hash)
+            stored_schema["ancestors"] = stored_schema["ancestors"][:10]

     stored_schema["version_hash"] = hash_
-    return stored_schema["version"], hash_
+    return stored_schema["version"], hash_, stored_schema["ancestors"]


 def generate_version_hash(stored_schema: TStoredSchema) -> str:
```

Reviewer: maybe we could have it in the constants so it is easy to find and change?
Author: I'm not sure what you mean?
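To make the bumping behavior concrete, here is a minimal, self-contained sketch of the logic above. It assumes a plain dict in place of TStoredSchema, takes the new hash as a parameter instead of computing it, and pulls the cap into a `MAX_ANCESTORS` constant as the reviewer suggests; all names here are illustrative, not dlt's API.

```python
from typing import Any, Dict, List, Tuple

MAX_ANCESTORS = 10  # the constant the reviewer suggests extracting

def bump_version_if_modified(stored_schema: Dict[str, Any], new_hash: str) -> Tuple[int, str, List[str]]:
    previous_hash = stored_schema.get("version_hash")
    if previous_hash is None:
        pass  # brand-new schema: keep the version, just stamp the hash
    elif new_hash != previous_hash:
        stored_schema["version"] += 1
        # unshift the previous hash and cap the ancestors list
        if previous_hash not in stored_schema["ancestors"]:
            stored_schema["ancestors"].insert(0, previous_hash)
            stored_schema["ancestors"] = stored_schema["ancestors"][:MAX_ANCESTORS]
    stored_schema["version_hash"] = new_hash
    return stored_schema["version"], new_hash, stored_schema["ancestors"]

schema = {"version": 1, "version_hash": "aaa", "ancestors": []}
version, _, ancestors = bump_version_if_modified(schema, "bbb")
assert version == 2 and ancestors == ["aaa"]
```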
```diff
@@ -153,6 +158,7 @@ def generate_version_hash(stored_schema: TStoredSchema) -> str:
     schema_copy.pop("version")
     schema_copy.pop("version_hash", None)
     schema_copy.pop("imported_version_hash", None)
+    schema_copy.pop("ancestors", None)
     # ignore order of elements when computing the hash
     content = json.dumps(schema_copy, sort_keys=True)
     h = hashlib.sha3_256(content.encode("utf-8"))
```
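The point of popping `ancestors` is that version bookkeeping must never influence the content hash; otherwise every bump would change the hash and trigger another bump. A hedged sketch of that invariant, assuming a hex digest for the final encoding (the diff stops at constructing the sha3 object, so that part is an assumption):

```python
import hashlib
import json

VOLATILE_KEYS = ("version", "version_hash", "imported_version_hash", "ancestors")

def content_hash(schema: dict) -> str:
    # drop volatile keys from a copy so version bookkeeping never feeds
    # back into the hash (which would change the hash on every bump)
    schema_copy = {k: v for k, v in schema.items() if k not in VOLATILE_KEYS}
    content = json.dumps(schema_copy, sort_keys=True)  # order-independent
    return hashlib.sha3_256(content.encode("utf-8")).hexdigest()

a = content_hash({"tables": {}, "version": 1, "ancestors": []})
b = content_hash({"tables": {}, "version": 5, "ancestors": ["xyz"]})
assert a == b  # same content, same hash, regardless of version metadata
```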
```diff
@@ -240,12 +246,18 @@ def compile_simple_regexes(r: Iterable[TSimpleRegex]) -> REPattern:


 def validate_stored_schema(stored_schema: TStoredSchema) -> None:
+    # exclude validation of keys added later
+    ignored_keys = []
+    if stored_schema["engine_version"] < 7:
+        ignored_keys.append("ancestors")
+
     # use lambda to verify only non extra fields
     validate_dict_ignoring_xkeys(
         spec=TStoredSchema,
         doc=stored_schema,
         path=".",
-        validator_f=simple_regex_validator
+        validator_f=simple_regex_validator,
+        filter_required=lambda k: k not in ignored_keys
     )
     # check child parent relationships
     for table_name, table in stored_schema["tables"].items():
```

Reviewer: why is this required? first we migrate the dictionary and then we validate the schema, so the engine should always be 7
Author: in test_upgrade_engine_v1_schema many different schema versions are validated. we could alternatively change that test.
Reviewer: really, I was sure we migrate and then validate. We should validate only the version after migration, so you can change the test. but you can keep this.
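As a small usage sketch, the exemption can be expressed as a filter factory keyed on the engine version (illustrative names, not dlt's code):

```python
def required_key_filter(engine_version: int):
    # schemas written before engine 7 legitimately lack "ancestors",
    # so that key is exempted from the required-key check
    ignored = []
    if engine_version < 7:
        ignored.append("ancestors")
    return lambda k: k not in ignored

assert required_key_filter(6)("ancestors") is False  # exempted
assert required_key_filter(7)("ancestors") is True   # enforced
```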
```diff
@@ -256,6 +268,7 @@ def validate_stored_schema(stored_schema: TStoredSchema) -> None:


 def migrate_schema(schema_dict: DictStrAny, from_engine: int, to_engine: int) -> TStoredSchema:

     if from_engine == to_engine:
         return cast(TStoredSchema, schema_dict)
```
```diff
@@ -340,6 +353,9 @@ def migrate_filters(group: str, filters: List[str]) -> None:
         # replace loads table
         schema_dict["tables"][LOADS_TABLE_NAME] = load_table()
         from_engine = 6
+    if from_engine == 6 and to_engine > 6:
+        schema_dict["ancestors"] = []
+        from_engine = 7

     schema_dict["engine_version"] = from_engine
     if from_engine != to_engine:
```
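The migration function follows a stepwise ladder: each block upgrades exactly one engine version and advances `from_engine`, so any old schema walks up to the target one step at a time. A minimal sketch of the pattern; the error branch is an assumption (the diff does not show what happens when the target is not reached):

```python
def migrate_schema(schema_dict: dict, from_engine: int, to_engine: int) -> dict:
    if from_engine == to_engine:
        return schema_dict
    # ... migrations 1 -> 2 through 5 -> 6 would run here first ...
    if from_engine == 6 and to_engine > 6:
        schema_dict["ancestors"] = []  # engine 7 introduces the ancestors list
        from_engine = 7
    schema_dict["engine_version"] = from_engine
    if from_engine != to_engine:
        raise ValueError(f"cannot migrate schema from engine {from_engine} to {to_engine}")
    return schema_dict

migrated = migrate_schema({"engine_version": 6}, from_engine=6, to_engine=7)
assert migrated == {"engine_version": 7, "ancestors": []}
```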
```diff
@@ -9,7 +9,7 @@
 TCustomValidator = Callable[[str, str, Any, Any], bool]


-def validate_dict(spec: Type[_TypedDict], doc: StrAny, path: str, filter_f: TFilterFunc = None, validator_f: TCustomValidator = None) -> None:
+def validate_dict(spec: Type[_TypedDict], doc: StrAny, path: str, filter_f: TFilterFunc = None, validator_f: TCustomValidator = None, filter_required: TFilterFunc = None) -> None:
     """Validate the `doc` dictionary based on the given typed dictionary specification `spec`.

     Args:
```

Reviewer: filter_required - what does it do? you added a new argument without docstrings
Reviewer: maybe rename to …; on the other hand I do not want Pydantic in core deps
```diff
@@ -32,11 +32,12 @@ def validate_dict(spec: Type[_TypedDict], doc: StrAny, path: str, filter_f: TFil
     """
     # pass through filter
     filter_f = filter_f or (lambda _: True)
+    filter_required = filter_required or (lambda _: True)
     # cannot validate anything
     validator_f = validator_f or (lambda p, pk, pv, t: False)

     allowed_props = get_type_hints(spec)
-    required_props = {k: v for k, v in allowed_props.items() if not is_optional_type(v)}
+    required_props = {k: v for k, v in allowed_props.items() if (not is_optional_type(v) and filter_required(k))}
     # remove optional props
     props = {k: v for k, v in doc.items() if filter_f(k)}
     # check missing props
```
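To answer the reviewer's question in code: `filter_required` lets a caller exempt selected keys from the required-field check that `validate_dict` performs against the TypedDict spec. A simplified, self-contained sketch; this is not dlt's actual validator, and the spec, helper, and error message here are illustrative:

```python
from typing import Callable, Optional, TypedDict, get_type_hints

class StoredSchema(TypedDict):  # illustrative spec, not dlt's TStoredSchema
    engine_version: int
    ancestors: list

def check_required(spec: type, doc: dict, filter_required: Optional[Callable[[str], bool]] = None) -> None:
    # pass-through default, mirroring the diff
    filter_required = filter_required or (lambda _: True)
    hints = get_type_hints(spec)
    required = {k for k in hints if filter_required(k)}
    missing = required - doc.keys()
    if missing:
        raise ValueError(f"following required fields are missing: {missing}")

# an engine-6 document legitimately lacks "ancestors", so exempt it:
check_required(StoredSchema, {"engine_version": 6}, filter_required=lambda k: k != "ancestors")
```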
```diff
@@ -145,9 +145,6 @@ def decorator(f: Callable[TSourceFunParams, Any]) -> Callable[TSourceFunParams,
         if name and name != schema.name:
             raise ExplicitSourceNameInvalid(name, schema.name)

-        # the name of the source must be identical to the name of the schema
-        name = schema.name
-
         # wrap source extraction function in configuration with section
         func_module = inspect.getmodule(f)
         source_section = section or _get_source_section_name(func_module)
```
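The naming rule the decorator now enforces: an explicit `name` is only accepted when it matches the schema's name, and the local rebinding of `name` becomes unnecessary because the schema is the single authority. A hedged sketch of just that rule; the exception name is taken from the diff, while the helper itself is hypothetical:

```python
from typing import Optional

class ExplicitSourceNameInvalid(Exception):  # name borrowed from the diff
    def __init__(self, name: str, schema_name: str) -> None:
        super().__init__(f"source name {name!r} does not match schema name {schema_name!r}")

def resolve_source_name(name: Optional[str], schema_name: str) -> str:
    if name and name != schema_name:
        raise ExplicitSourceNameInvalid(name, schema_name)
    return schema_name  # the schema's name is the single source of truth

assert resolve_source_name(None, "github") == "github"
assert resolve_source_name("github", "github") == "github"
```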
```diff
@@ -162,16 +159,16 @@ def _wrap(*args: Any, **kwargs: Any) -> TDltSourceImpl:
             # configurations will be accessed in this section in the source
             proxy = Container()[PipelineContext]
             pipeline_name = None if not proxy.is_active() else proxy.pipeline().pipeline_name
-            with inject_section(ConfigSectionContext(pipeline_name=pipeline_name, sections=source_sections, source_state_key=name)):
+            with inject_section(ConfigSectionContext(pipeline_name=pipeline_name, sections=source_sections, source_state_key=schema.name)):
                 rv = conf_f(*args, **kwargs)
                 if rv is None:
-                    raise SourceDataIsNone(name)
+                    raise SourceDataIsNone(schema.name)
                 # if generator, consume it immediately
                 if inspect.isgenerator(rv):
                     rv = list(rv)

             # convert to source
-            s = _impl_cls.from_data(name, source_section, schema.clone(update_normalizers=True), rv)
+            s = _impl_cls.from_data(source_section, schema.clone(update_normalizers=True), rv)

             # apply hints
             if max_table_nesting is not None:
                 s.max_table_nesting = max_table_nesting
```

Reviewer: 👍
```diff
@@ -642,22 +642,17 @@ class DltSource(Iterable[TDataItem]):
     * You can use a `run` method to load the data with a default instance of dlt pipeline.
     * You can get source read only state for the currently active Pipeline instance
     """
-    def __init__(self, name: str, section: str, schema: Schema, resources: Sequence[DltResource] = None) -> None:
-        self.name = name
+    def __init__(self, section: str, schema: Schema, resources: Sequence[DltResource] = None) -> None:
```

Reviewer: could we swap schema with section? it feels more natural - schema is the most important arg
Author: done
```diff
         self.section = section
         """Tells if iterator associated with a source is exhausted"""
         self._schema = schema
         self._resources: DltResourceDict = DltResourceDict(self.name, self.section)

-        if self.name != schema.name:
-            # raise ValueError(f"Schema name {schema.name} differs from source name {name}! The explicit source name argument is deprecated and will be soon removed.")
-            warnings.warn(f"Schema name {schema.name} differs from source name {name}! The explicit source name argument is deprecated and will be soon removed.")
-
         if resources:
             self.resources.add(*resources)

     @classmethod
-    def from_data(cls, name: str, section: str, schema: Schema, data: Any) -> Self:
+    def from_data(cls, section: str, schema: Schema, data: Any) -> Self:
```

Reviewer: also here
Author: done

```diff
         """Converts any `data` supported by `dlt` `run` method into `dlt source` with a name `section`.`name` and `schema` schema."""
         # creates source from various forms of data
         if isinstance(data, DltSource):
```
```diff
@@ -669,10 +664,14 @@ def from_data(cls, name: str, section: str, schema: Schema, data: Any) -> Self:
         else:
             resources = [DltResource.from_data(data)]

-        return cls(name, section, schema, resources)
+        return cls(section, schema, resources)

+    @property
+    def name(self) -> str:
+        return self._schema.name
+
     # TODO: 4 properties below must go somewhere else ie. into RelationalSchema which is Schema + Relational normalizer.
     @property
     def max_table_nesting(self) -> int:
         """A schema hint that sets the maximum depth of nested table above which the remaining nodes are loaded as structs or JSON."""
```
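The net effect of this refactor is that a source's name is no longer stored separately but always derived from its schema, so the two can never diverge (previously a mismatch only produced a deprecation warning). A minimal illustrative sketch with stand-in classes rather than dlt's real Schema and DltSource:

```python
from dataclasses import dataclass

@dataclass
class Schema:  # minimal stand-in for dlt's Schema
    name: str

class Source:  # illustrative stand-in for DltSource
    def __init__(self, section: str, schema: Schema) -> None:
        self.section = section
        self._schema = schema

    @property
    def name(self) -> str:
        # single source of truth: the name is always the schema's name
        return self._schema.name

s = Source("sources", Schema(name="github"))
assert s.name == "github"
```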
```diff
@@ -795,7 +794,7 @@ def state(self) -> StrAny:
     def clone(self) -> "DltSource":
         """Creates a deep copy of the source where copies of schema, resources and pipes are created"""
         # mind that resources and pipes are cloned when added to the DltResourcesDict in the source constructor
-        return DltSource(self.name, self.section, self.schema.clone(), list(self._resources.values()))
+        return DltSource(self.section, self.schema.clone(), list(self._resources.values()))

     def __iter__(self) -> Iterator[TDataItem]:
         """Opens iterator that yields the data items from all the resources within the source in the same order as in Pipeline class.
```
Reviewer: this is not an ancestor, this is the previous version hash. maybe let's name it like that. we do not have any schema derivation scheme like we have in pydantic models (ie. base schema etc.), this is just a version (probably "revision" would be better - but it is too late to change it)
Reviewer: _previous_version_hashes or _previous_hashes would be better
Author: done!