Skip to content

Commit

Permalink
Autodetector for ISO date strings (#767)
Browse files Browse the repository at this point in the history
* Feat: iso_date auto-detection function

* Feat: tests for iso_date auto-detection

* Feat: make iso_date autodetection a default behavior

* NB: update docs

* fixup! Feat: tests for iso_date auto-detection

* NB: Fix makefile

* Fix: linting

* Fix: support dates with reduced precision

* fixup! Fix: support dates with reduced precision

* Fix: don't make iso-date a default auto-detector

---------

Co-authored-by: Skynet <robo@taktile.com>
  • Loading branch information
codingcyclist and Skynet authored Nov 17, 2023
1 parent f72c331 commit e970741
Show file tree
Hide file tree
Showing 10 changed files with 64 additions and 7 deletions.
2 changes: 1 addition & 1 deletion Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,7 @@ help:
@echo " runs flake and mypy"
@echo " test"
@echo " tests all the components including destinations"
@echo " test-local"
@echo " test-load-local"
@echo " tests all components unsing local destinations: duckdb and postgres"
@echo " test-common"
@echo " tests common components"
Expand Down
19 changes: 19 additions & 0 deletions dlt/common/schema/detections.py
Original file line number Diff line number Diff line change
Expand Up @@ -36,6 +36,25 @@ def is_iso_timestamp(t: Type[Any], v: Any) -> Optional[TDataType]:
return None


def is_iso_date(t: Type[Any], v: Any) -> Optional[TDataType]:
    """Detect string values that should be typed as "date".

    Args:
        t: The Python type of the value; only subclasses of `str` are considered.
        v: The value itself; falsy values (e.g. the empty string) are ignored.

    Returns:
        "date" when `is_iso_timestamp(t, v)` does not claim the value and
        `parse_iso_like_datetime(v)` yields a `datetime.date` instance,
        otherwise None.
    """
    # only non-empty strings can be converted
    if not (issubclass(t, str) and v):
        return None
    # don't cast iso timestamps as dates: the timestamp detector takes precedence
    if is_iso_timestamp(t, v):
        return None
    # strict autodetection of iso dates
    try:
        if isinstance(parse_iso_like_datetime(v), datetime.date):
            return "date"
    except Exception:
        # best-effort detection: any failure while parsing means "not a date"
        return None
    return None


def is_large_integer(t: Type[Any], v: Any) -> Optional[TDataType]:
# only ints can be converted
if issubclass(t, int):
Expand Down
2 changes: 1 addition & 1 deletion dlt/common/schema/typing.py
Original file line number Diff line number Diff line change
Expand Up @@ -25,7 +25,7 @@
"""Known hints of a column used to declare hint regexes."""
TWriteDisposition = Literal["skip", "append", "replace", "merge"]
TTableFormat = Literal["iceberg"]
TTypeDetections = Literal["timestamp", "iso_timestamp", "large_integer", "hexbytes_to_text", "wei_to_double"]
TTypeDetections = Literal["timestamp", "iso_timestamp", "iso_date", "large_integer", "hexbytes_to_text", "wei_to_double"]
TTypeDetectionFunc = Callable[[Type[Any], Any], Optional[TDataType]]
TColumnNames = Union[str, Sequence[str]]
"""A string representing a column name or a list of"""
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -109,6 +109,7 @@ settings:
detections:
- timestamp
- iso_timestamp
- iso_date
default_hints:
not_null:
- _dlt_id
Expand Down
1 change: 1 addition & 0 deletions docs/examples/archive/schemas/dlt_quickstart.schema.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -84,6 +84,7 @@ normalizers:
detections:
- timestamp
- iso_timestamp
- iso_date
names: dlt.common.normalizers.names.snake_case
json:
module: dlt.common.normalizers.json.relational
1 change: 1 addition & 0 deletions docs/technical/working_with_schemas.md
Original file line number Diff line number Diff line change
Expand Up @@ -124,6 +124,7 @@ settings:
detections:
- timestamp
- iso_timestamp
- iso_date
```

⛔ We may define an `all_text` function that will generate string-only schemas by telling `dlt` that all types should be coerced to strings.
Expand Down
1 change: 1 addition & 0 deletions docs/website/docs/general-usage/schema.md
Original file line number Diff line number Diff line change
Expand Up @@ -182,6 +182,7 @@ settings:
detections:
- timestamp
- iso_timestamp
- iso_date
```
### Column hint rules
Expand Down
5 changes: 3 additions & 2 deletions tests/common/cases/schemas/github/issues.schema.json
Original file line number Diff line number Diff line change
Expand Up @@ -1294,7 +1294,8 @@
"settings": {
"detections": [
"timestamp",
"iso_timestamp"
"iso_timestamp",
"iso_date"
],
"default_hints": {
"not_null": [
Expand All @@ -1318,4 +1319,4 @@
"module": "dlt.common.normalizers.json.relational"
}
}
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -387,11 +387,12 @@
"normalizers": {
"detections": [
"timestamp",
"iso_timestamp"
"iso_timestamp",
"iso_date"
],
"names": "dlt.common.normalizers.names.snake_case",
"json": {
"module": "dlt.common.normalizers.json.relational"
}
}
}
}
34 changes: 33 additions & 1 deletion tests/common/schema/test_detections.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@

from dlt.common import pendulum, Decimal, Wei
from dlt.common.schema.utils import autodetect_sc_type
from dlt.common.schema.detections import is_hexbytes_to_text, is_timestamp, is_iso_timestamp, is_large_integer, is_wei_to_double, _FLOAT_TS_RANGE, _NOW_TS
from dlt.common.schema.detections import is_hexbytes_to_text, is_timestamp, is_iso_timestamp, is_iso_date, is_large_integer, is_wei_to_double, _FLOAT_TS_RANGE, _NOW_TS


def test_timestamp_detection() -> None:
Expand Down Expand Up @@ -34,6 +34,36 @@ def test_iso_timestamp_detection() -> None:
assert is_iso_timestamp(float, str(pendulum.now())) is None


def test_iso_date_detection() -> None:
    """Check which string values `is_iso_date` reports as "date" and which it rejects."""
    today = str(pendulum.now().date())

    accepted = (
        today,
        "1975-05-21",
        "19750521",
        # ISO-8601 allows dates with reduced precision
        "1975-05",
        "1975",
    )
    for value in accepted:
        assert is_iso_date(str, value) == "date"

    rejected = (
        # don't auto-detect timestamps as dates
        str(pendulum.now()),
        "1975-05-21T22:00:00Z",
        "2022-06-01T00:48:35.040Z",
        "1975-0521T22:00:00Z",
        "2021-07-24 10:51",
        # times are not accepted
        "22:00:00",
        # wrong formats
        "197505",
        "0-05-01",
        "",
        "75",
        "01-12",
        "1975/05/01",
    )
    for value in rejected:
        assert is_iso_date(str, value) is None

    # wrong type
    assert is_iso_date(float, today) is None


def test_detection_large_integer() -> None:
assert is_large_integer(str, "A") is None
assert is_large_integer(int, 2**64 // 2) == "wei"
Expand All @@ -56,6 +86,8 @@ def test_detection_function() -> None:
assert autodetect_sc_type(None, str, str(pendulum.now())) is None
assert autodetect_sc_type(["iso_timestamp"], str, str(pendulum.now())) == "timestamp"
assert autodetect_sc_type(["iso_timestamp"], float, str(pendulum.now())) is None
assert autodetect_sc_type(["iso_date"], str, str(pendulum.now().date())) == "date"
assert autodetect_sc_type(["iso_date"], float, str(pendulum.now().date())) is None
assert autodetect_sc_type(["timestamp"], str, str(pendulum.now())) is None
assert autodetect_sc_type(["timestamp", "iso_timestamp"], float, pendulum.now().timestamp()) == "timestamp"
assert autodetect_sc_type(["timestamp", "large_integer"], int, 2**64) == "wei"
Expand Down

0 comments on commit e970741

Please sign in to comment.