Skip to content

Commit

Permalink
Fix: Resolve issues in Postgres name normalization when names are >63 characters (#359)
Browse files Browse the repository at this point in the history
  • Loading branch information
aaronsteers authored Sep 9, 2024
1 parent 826d689 commit ddbf6f4
Show file tree
Hide file tree
Showing 15 changed files with 173 additions and 74 deletions.
2 changes: 1 addition & 1 deletion airbyte/_executors/python.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@
from typing import TYPE_CHECKING, Literal

from overrides import overrides
from rich import print
from rich import print # noqa: A004 # Allow shadowing the built-in

from airbyte import exceptions as exc
from airbyte._executors.base import Executor
Expand Down
2 changes: 1 addition & 1 deletion airbyte/_executors/util.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@
import requests
import yaml
from requests import HTTPError
from rich import print
from rich import print # noqa: A004 # Allow shadowing the built-in

from airbyte import exceptions as exc
from airbyte._executors.declarative import DeclarativeExecutor
Expand Down
24 changes: 24 additions & 0 deletions airbyte/_processors/sql/postgres.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,8 +3,11 @@

from __future__ import annotations

import functools

from overrides import overrides

from airbyte._util.name_normalizers import LowerCaseNormalizer
from airbyte._writers.jsonl import JsonlWriter
from airbyte.secrets.base import SecretString
from airbyte.shared.sql_processor import SqlConfig, SqlProcessorBase
Expand Down Expand Up @@ -35,6 +38,24 @@ def get_database_name(self) -> str:
return self.database


class PostgresNormalizer(LowerCaseNormalizer):
    """Name normalizer that enforces the Postgres name length limit.

    Postgres caps the length of field names:
    - Table names are limited to 63 characters.
    - Column names are limited to 63 characters.

    This normalizer applies the rules of the default `LowerCaseNormalizer` first and
    then cuts the result down to at most 63 characters. Names that are identical in
    their first 63 normalized characters therefore normalize to the same value.
    """

    @staticmethod
    @functools.cache
    def normalize(name: str) -> str:
        """Return the lower-case-normalized `name`, truncated to 63 characters."""
        # Results are memoized per input name via `functools.cache`.
        lowercased = LowerCaseNormalizer.normalize(name)
        return lowercased[0:63]


class PostgresSqlProcessor(SqlProcessorBase):
"""A Postgres implementation of the cache.
Expand All @@ -49,3 +70,6 @@ class PostgresSqlProcessor(SqlProcessorBase):
supports_merge_insert = False
file_writer_class = JsonlWriter
sql_config: PostgresConfig

normalizer = PostgresNormalizer
"""A Postgres-specific name normalizer for table and column name normalization."""
2 changes: 1 addition & 1 deletion airbyte/_util/temp_files.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,7 @@ def as_temp_files(files_contents: list[dict | str]) -> Generator[list[str], Any,
try:
for content in files_contents:
use_json = isinstance(content, dict)
temp_file = tempfile.NamedTemporaryFile(
temp_file = tempfile.NamedTemporaryFile( # noqa: SIM115 # Avoiding context manager
mode="w+t",
delete=False,
encoding="utf-8",
Expand Down
2 changes: 1 addition & 1 deletion airbyte/_writers/jsonl.py
Original file line number Diff line number Diff line change
Expand Up @@ -35,7 +35,7 @@ def _open_new_file(
"""Open a new file for writing."""
return cast(
IO[str],
gzip.open(
gzip.open( # noqa: SIM115 # Avoiding context manager
file_path,
mode="wt",
encoding="utf-8",
Expand Down
8 changes: 4 additions & 4 deletions airbyte/cloud/workspaces.py
Original file line number Diff line number Diff line change
Expand Up @@ -215,10 +215,10 @@ def _deploy_connection(
source_id: str
if isinstance(source, Source):
selected_streams = selected_streams or source.get_selected_streams()
if source._deployed_source_id: # noqa: SLF001
source_id = source._deployed_source_id # noqa: SLF001
else:
source_id = self._deploy_source(source)
source_id = (
source._deployed_source_id # noqa: SLF001 # Access to non-public API
or self._deploy_source(source)
)
else:
source_id = source
if not selected_streams:
Expand Down
2 changes: 1 addition & 1 deletion airbyte/secrets/config.py
Original file line number Diff line number Diff line change
Expand Up @@ -77,6 +77,6 @@ def disable_secret_source(source: SecretManager | SecretSourceEnum) -> None:
return

# Else, remove by name
for s in _SECRETS_SOURCES:
for s in list(_SECRETS_SOURCES).copy():
if s.name == str(source):
_SECRETS_SOURCES.remove(s)
16 changes: 14 additions & 2 deletions airbyte/shared/sql_processor.py
Original file line number Diff line number Diff line change
Expand Up @@ -497,8 +497,20 @@ def _get_temp_table_name(
batch_id: str | None = None, # ULID of the batch
) -> str:
"""Return a new (unique) temporary table name."""
batch_id = batch_id or str(ulid.ULID())
return self.normalizer.normalize(f"{stream_name}_{batch_id}")
if not batch_id:
batch_id = str(ulid.ULID())

# Use the first 6 and last 3 characters of the ULID. This gives great uniqueness while
# limiting the table name suffix to 10 characters, including the underscore.
suffix = (
f"{batch_id[:6]}{batch_id[-3:]}"
if len(batch_id) > 9 # noqa: PLR2004 # Allow magic int value
else batch_id
)

# Note: The normalizer may truncate the table name if the database has a name length limit.
# For instance, the Postgres normalizer will enforce a 63-character limit on table names.
return self.normalizer.normalize(f"{stream_name}_{suffix}")

def _fully_qualified(
self,
Expand Down
24 changes: 20 additions & 4 deletions airbyte/sources/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@
from typing import TYPE_CHECKING, Any, Literal

import yaml
from rich import print
from rich import print # noqa: A004 # Allow shadowing the built-in
from rich.syntax import Syntax

from airbyte_protocol.models import (
Expand Down Expand Up @@ -405,9 +405,25 @@ def get_stream_json_schema(self, stream_name: str) -> dict[str, Any]:

return found[0].json_schema

def get_records(self, stream: str) -> LazyDataset:
def get_records(
self,
stream: str,
*,
normalize_field_names: bool = False,
prune_undeclared_fields: bool = True,
) -> LazyDataset:
"""Read a stream from the connector.
Args:
stream: The name of the stream to read.
normalize_field_names: When `True`, field names will be normalized to lower case, with
special characters removed. This matches the behavior of PyAirbyte caches and most
Airbyte destinations.
prune_undeclared_fields: When `True`, undeclared fields will be pruned from the records,
which generally matches the behavior of PyAirbyte caches and most Airbyte
destinations, specifically when you expect the catalog may be stale. You can disable
this to keep all fields in the records.
This involves the following steps:
* Call discover to get the catalog
* Generate a configured catalog that syncs the given stream in full_refresh mode
Expand Down Expand Up @@ -445,8 +461,8 @@ def _with_logging(records: Iterable[dict[str, Any]]) -> Iterator[dict[str, Any]]

stream_record_handler = StreamRecordHandler(
json_schema=self.get_stream_json_schema(stream),
prune_extra_fields=True,
normalize_keys=False,
prune_extra_fields=prune_undeclared_fields,
normalize_keys=normalize_field_names,
)

# This method is non-blocking, so we use "PLAIN" to avoid a live progress display
Expand Down
3 changes: 2 additions & 1 deletion airbyte/types.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
# noqa: A005 # Allow shadowing the built-in 'types' module
# Copyright (c) 2023 Airbyte, Inc., all rights reserved.

"""Type conversion methods for SQL Caches."""
Expand All @@ -7,7 +8,7 @@
from typing import cast

import sqlalchemy
from rich import print
from rich import print # noqa: A004 # Allow shadowing the built-in


# Compare to documentation here: https://docs.airbyte.com/understanding-airbyte/supported-data-types
Expand Down
2 changes: 1 addition & 1 deletion airbyte/validate.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,7 @@
from pathlib import Path

import yaml
from rich import print
from rich import print # noqa: A004 # Allow shadowing the built-in

import airbyte as ab
from airbyte import exceptions as exc
Expand Down
39 changes: 20 additions & 19 deletions poetry.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

4 changes: 2 additions & 2 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -59,7 +59,7 @@ pdoc = "^14.3.0"
pytest = "^8.2.0"
pytest-docker = "^3.1.1"
pytest-mypy = "^0.10.3"
ruff = "0.4.1"
ruff = "^0.6.4"
types-jsonschema = "^4.20.0.0"
types-requests = "2.31.0.4"
freezegun = "^1.4.0"
Expand Down Expand Up @@ -188,7 +188,7 @@ ignore = [
"S", # flake8-bandit (noisy, security related)
"SIM910", # Allow "None" as second argument to Dict.get(). "Explicit is better than implicit."
"TD002", # Require author for TODOs
"TRIO", # flake8-trio (opinionated, noisy)
"ASYNC1", # flake8-async 1xx rules, formerly flake8-trio (opinionated, noisy)
"INP001", # Dir 'examples' is part of an implicit namespace package. Add an __init__.py.

# TODO: Consider re-enabling these before release:
Expand Down
Loading

0 comments on commit ddbf6f4

Please sign in to comment.