Fix: Resolve issues in Postgres name normalization when names are >63 characters (#359)
airbytehq · Sep 9, 2024 · ddbf6f4 · ddbf6f4
1 parent 826d689
commit ddbf6f4
Show file tree

Hide file tree

Showing 15 changed files with 173 additions and 74 deletions.
diff --git a/airbyte/_executors/python.py b/airbyte/_executors/python.py
@@ -10,7 +10,7 @@
 from typing import TYPE_CHECKING, Literal
 
 from overrides import overrides
-from rich import print
+from rich import print # noqa: A004 # Allow shadowing the built-in
 
 from airbyte import exceptions as exc
 from airbyte._executors.base import Executor

diff --git a/airbyte/_executors/util.py b/airbyte/_executors/util.py
@@ -8,7 +8,7 @@
 import requests
 import yaml
 from requests import HTTPError
-from rich import print
+from rich import print # noqa: A004 # Allow shadowing the built-in
 
 from airbyte import exceptions as exc
 from airbyte._executors.declarative import DeclarativeExecutor

diff --git a/airbyte/_processors/sql/postgres.py b/airbyte/_processors/sql/postgres.py
@@ -3,8 +3,11 @@
 
 from __future__ import annotations
 
+import functools
+
 from overrides import overrides
 
+from airbyte._util.name_normalizers import LowerCaseNormalizer
 from airbyte._writers.jsonl import JsonlWriter
 from airbyte.secrets.base import SecretString
 from airbyte.shared.sql_processor import SqlConfig, SqlProcessorBase
@@ -35,6 +38,24 @@ def get_database_name(self) -> str:
  return self.database
 
 
+class PostgresNormalizer(LowerCaseNormalizer):
+ """A name normalizer for Postgres.
+
+ Postgres has specific field name length limits:
+ - Table names are limited to 63 characters.
+ - Column names are limited to 63 characters.
+
+ The postgres normalizer inherits from the default LowerCaseNormalizer class, and
+ additionally truncates column and table names to 63 characters.
+ """
+
+ @staticmethod
+ @functools.cache
+ def normalize(name: str) -> str:
+ """Normalize the name, truncating to 63 characters."""
+ return LowerCaseNormalizer.normalize(name)[:63]
+
+
 class PostgresSqlProcessor(SqlProcessorBase):
  """A Postgres implementation of the cache.
 
@@ -49,3 +70,6 @@ class PostgresSqlProcessor(SqlProcessorBase):
  supports_merge_insert = False
  file_writer_class = JsonlWriter
  sql_config: PostgresConfig
+
+ normalizer = PostgresNormalizer
+ """A Postgres-specific name normalizer for table and column name normalization."""
diff --git a/airbyte/_util/temp_files.py b/airbyte/_util/temp_files.py
@@ -23,7 +23,7 @@ def as_temp_files(files_contents: list[dict | str]) -> Generator[list[str], Any,
  try:
  for content in files_contents:
  use_json = isinstance(content, dict)
- temp_file = tempfile.NamedTemporaryFile(
+ temp_file = tempfile.NamedTemporaryFile( # noqa: SIM115 # Avoiding context manager
  mode="w+t",
  delete=False,
  encoding="utf-8",

diff --git a/airbyte/_writers/jsonl.py b/airbyte/_writers/jsonl.py
@@ -35,7 +35,7 @@ def _open_new_file(
  """Open a new file for writing."""
  return cast(
  IO[str],
- gzip.open(
+ gzip.open( # noqa: SIM115 # Avoiding context manager
  file_path,
  mode="wt",
  encoding="utf-8",

diff --git a/airbyte/cloud/workspaces.py b/airbyte/cloud/workspaces.py
@@ -215,10 +215,10 @@ def _deploy_connection(
  source_id: str
  if isinstance(source, Source):
  selected_streams = selected_streams or source.get_selected_streams()
- if source._deployed_source_id: # noqa: SLF001
- source_id = source._deployed_source_id # noqa: SLF001
- else:
-  source_id = self._deploy_source(source)
+ source_id = (
+ source._deployed_source_id # noqa: SLF001 # Access to non-public API
+  or self._deploy_source(source)
+ )
  else:
  source_id = source
  if not selected_streams:

diff --git a/airbyte/secrets/config.py b/airbyte/secrets/config.py
@@ -77,6 +77,6 @@ def disable_secret_source(source: SecretManager | SecretSourceEnum) -> None:
  return
 
  # Else, remove by name
- for s in _SECRETS_SOURCES:
+ for s in list(_SECRETS_SOURCES).copy():
  if s.name == str(source):
  _SECRETS_SOURCES.remove(s)
diff --git a/airbyte/shared/sql_processor.py b/airbyte/shared/sql_processor.py
@@ -497,8 +497,20 @@ def _get_temp_table_name(
  batch_id: str | None = None, # ULID of the batch
  ) -> str:
  """Return a new (unique) temporary table name."""
- batch_id = batch_id or str(ulid.ULID())
- return self.normalizer.normalize(f"{stream_name}_{batch_id}")
+ if not batch_id:
+ batch_id = str(ulid.ULID())
+
+ # Use the first 6 and last 3 characters of the ULID. This gives great uniqueness while
+ # limiting the table name suffix to 10 characters, including the underscore.
+ suffix = (
+ f"{batch_id[:6]}{batch_id[-3:]}"
+ if len(batch_id) > 9 # noqa: PLR2004 # Allow magic int value
+ else batch_id
+ )
+
+ # Note: The normalizer may truncate the table name if the database has a name length limit.
+ # For instance, the Postgres normalizer will enforce a 63-character limit on table names.
+ return self.normalizer.normalize(f"{stream_name}_{suffix}")
 
  def _fully_qualified(
  self,

diff --git a/airbyte/sources/base.py b/airbyte/sources/base.py
@@ -9,7 +9,7 @@
 from typing import TYPE_CHECKING, Any, Literal
 
 import yaml
-from rich import print
+from rich import print # noqa: A004 # Allow shadowing the built-in
 from rich.syntax import Syntax
 
 from airbyte_protocol.models import (
@@ -405,9 +405,25 @@ def get_stream_json_schema(self, stream_name: str) -> dict[str, Any]:
 
  return found[0].json_schema
 
- def get_records(self, stream: str) -> LazyDataset:
+ def get_records(
+ self,
+ stream: str,
+ *,
+ normalize_field_names: bool = False,
+ prune_undeclared_fields: bool = True,
+ ) -> LazyDataset:
  """Read a stream from the connector.
 
+ Args:
+ stream: The name of the stream to read.
+ normalize_field_names: When `True`, field names will be normalized to lower case, with
+ special characters removed. This matches the behavior of PyAirbyte caches and most
+ Airbyte destinations.
+ prune_undeclared_fields: When `True`, undeclared fields will be pruned from the records,
+ which generally matches the behavior of PyAirbyte caches and most Airbyte
+ destinations, specifically when you expect the catalog may be stale. You can disable
+ this to keep all fields in the records.
+
  This involves the following steps:
  * Call discover to get the catalog
  * Generate a configured catalog that syncs the given stream in full_refresh mode
@@ -445,8 +461,8 @@ def _with_logging(records: Iterable[dict[str, Any]]) -> Iterator[dict[str, Any]]
 
  stream_record_handler = StreamRecordHandler(
  json_schema=self.get_stream_json_schema(stream),
- prune_extra_fields=True,
- normalize_keys=False,
+ prune_extra_fields=prune_undeclared_fields,
+ normalize_keys=normalize_field_names,
  )
 
  # This method is non-blocking, so we use "PLAIN" to avoid a live progress display

diff --git a/airbyte/types.py b/airbyte/types.py
@@ -1,3 +1,4 @@
+# noqa: A005 # Allow shadowing the built-in 'types' module
 # Copyright (c) 2023 Airbyte, Inc., all rights reserved.
 
 """Type conversion methods for SQL Caches."""
@@ -7,7 +8,7 @@
 from typing import cast
 
 import sqlalchemy
-from rich import print
+from rich import print # noqa: A004 # Allow shadowing the built-in
 
 
 # Compare to documentation here: https://docs.airbyte.com/understanding-airbyte/supported-data-types

diff --git a/airbyte/validate.py b/airbyte/validate.py
@@ -15,7 +15,7 @@
 from pathlib import Path
 
 import yaml
-from rich import print
+from rich import print # noqa: A004 # Allow shadowing the built-in
 
 import airbyte as ab
 from airbyte import exceptions as exc

diff --git a/poetry.lock b/poetry.lock
diff --git a/pyproject.toml b/pyproject.toml
@@ -59,7 +59,7 @@ pdoc = "^14.3.0"
 pytest = "^8.2.0"
 pytest-docker = "^3.1.1"
 pytest-mypy = "^0.10.3"
-ruff = "0.4.1"
+ruff = "^0.6.4"
 types-jsonschema = "^4.20.0.0"
 types-requests = "2.31.0.4"
 freezegun = "^1.4.0"
@@ -188,7 +188,7 @@ ignore = [
  "S", # flake8-bandit (noisy, security related)
  "SIM910", # Allow "None" as second argument to Dict.get(). "Explicit is better than implicit."
  "TD002", # Require author for TODOs
- "TRIO",  # flake8-trio (opinionated, noisy)
+ "ASYNC1", # flake8-async, formerly flake8-trio (opinionated, noisy)
  "INP001", # Dir 'examples' is part of an implicit namespace package. Add an __init__.py.
 
  # TODO: Consider re-enabling these before release: