-
Notifications
You must be signed in to change notification settings - Fork 4.2k
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
AirbyteLib: add SQLCaches for DuckDB and Postgres (includes Ruff+Mypy cleanup) (#33607)
- Loading branch information
1 parent
f637e11
commit 99a23dc
Showing 49 changed files with 4,925 additions and 371 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,12 +1,15 @@ | ||
from airbyte_lib._factories.cache_factories import get_default_cache, new_local_cache | ||
from airbyte_lib._factories.connector_factories import get_connector | ||
from airbyte_lib.datasets import CachedDataset | ||
from airbyte_lib.results import ReadResult | ||
from airbyte_lib.source import Source | ||
|
||
from .factories import (get_connector, get_in_memory_cache) | ||
from .sync_result import (Dataset, SyncResult) | ||
from .source import (Source) | ||
|
||
__all__ = [ | ||
"get_connector", | ||
"get_in_memory_cache", | ||
"Dataset", | ||
"SyncResult", | ||
"get_default_cache", | ||
"new_local_cache", | ||
"CachedDataset", | ||
"ReadResult", | ||
"Source", | ||
] |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Empty file.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,58 @@ | ||
# Copyright (c) 2023 Airbyte, Inc., all rights reserved. | ||
|
||
|
||
from pathlib import Path | ||
|
||
import ulid | ||
|
||
from airbyte_lib.caches.duckdb import DuckDBCache, DuckDBCacheConfig | ||
|
||
|
||
def get_default_cache() -> DuckDBCache:
    """Build a `DuckDBCache` that uses the default database path.

    The database file is `./.cache/default_cache_db.duckdb`, i.e. it lives in
    the `.cache` directory relative to the current working directory.
    """
    return DuckDBCache(
        config=DuckDBCacheConfig(db_path="./.cache/default_cache_db.duckdb"),
    )
|
||
|
||
def new_local_cache(
    cache_name: str | None = None,
    cache_dir: str | Path | None = None,
    cleanup: bool = True,
) -> DuckDBCache:
    """Create a new local cache, using a name string to seed the path.

    Args:
        cache_name: Name to use for the cache. May only contain alphanumeric
            characters and underscores. If omitted (or empty), a freshly
            generated ULID string is used as the name.
        cache_dir: Directory to store the cache in. If omitted, defaults to
            `./.cache/<cache_name>`, relative to the current working directory.
        cleanup: Whether to clean up temporary files. Defaults to True.

    Returns:
        A `DuckDBCache` whose database file is `db_<cache_name>.duckdb` inside
        the cache directory.

    Raises:
        ValueError: If `cache_name` contains a space, or any character other
            than alphanumerics and underscores.
    """
    if cache_name:
        # Spaces get their own check so the error message is specific.
        if " " in cache_name:
            raise ValueError(f"Cache name '{cache_name}' cannot contain spaces")

        if not cache_name.replace("_", "").isalnum():
            raise ValueError(
                f"Cache name '{cache_name}' can only contain alphanumeric "
                "characters and underscores."
            )

    cache_name = cache_name or str(ulid.ULID())
    # `Path(...)` accepts both `str` and `Path`, so one call normalizes the
    # caller-supplied directory and the default alike.
    cache_dir = Path(cache_dir or f"./.cache/{cache_name}")

    config = DuckDBCacheConfig(
        db_path=cache_dir / f"db_{cache_name}.duckdb",
        cache_dir=cache_dir,
        cleanup=cleanup,
    )
    return DuckDBCache(config=config)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,11 @@ | ||
from .base import FileWriterBase, FileWriterBatchHandle, FileWriterConfigBase | ||
from .parquet import ParquetWriter, ParquetWriterConfig | ||
|
||
|
||
__all__ = [ | ||
"FileWriterBatchHandle", | ||
"FileWriterBase", | ||
"FileWriterConfigBase", | ||
"ParquetWriter", | ||
"ParquetWriterConfig", | ||
] |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,112 @@ | ||
# Copyright (c) 2023 Airbyte, Inc., all rights reserved. | ||
|
||
"""Define abstract base class for File Writers, which write and read from file storage.""" | ||
|
||
from __future__ import annotations | ||
|
||
import abc | ||
from dataclasses import dataclass, field | ||
from pathlib import Path | ||
from typing import TYPE_CHECKING, cast, final | ||
|
||
from overrides import overrides | ||
|
||
from airbyte_lib._processors import BatchHandle, RecordProcessor | ||
from airbyte_lib.config import CacheConfigBase | ||
|
||
|
||
if TYPE_CHECKING: | ||
import pyarrow as pa | ||
|
||
|
||
DEFAULT_BATCH_SIZE = 10000 | ||
|
||
|
||
# The batch handle for file writers is a list of Path objects. | ||
@dataclass
class FileWriterBatchHandle(BatchHandle):
    """Batch handle for file writers: holds a list of `Path` objects."""

    # `default_factory` gives every handle its own, initially empty, list.
    files: list[Path] = field(default_factory=list)
|
||
|
||
class FileWriterConfigBase(CacheConfigBase):
    """Base configuration for file writers."""

    cache_dir: Path = Path("./.cache/files/")
    """The directory to store cache files in."""

    cleanup: bool = True
    """Whether to clean up temporary files after processing a batch."""
|
||
|
||
class FileWriterBase(RecordProcessor, abc.ABC):
    """A generic base implementation for a file-based cache.

    Subclasses implement `_write_batch` (and may override `_cleanup_batch`);
    the public `write_batch` and `cleanup_batch` entry points are final and
    simply delegate to those hooks.
    """

    config_class = FileWriterConfigBase
    config: FileWriterConfigBase

    @abc.abstractmethod
    @overrides
    def _write_batch(
        self,
        stream_name: str,
        batch_id: str,
        record_batch: pa.Table | pa.RecordBatch,
    ) -> FileWriterBatchHandle:
        """Process a record batch.

        Return a batch handle listing the paths to one or more cache files.
        """
        ...

    @final
    def write_batch(
        self,
        stream_name: str,
        batch_id: str,
        record_batch: pa.Table | pa.RecordBatch,
    ) -> FileWriterBatchHandle:
        """Write a batch of records to the cache.

        This method is final because it should not be overridden.
        Subclasses should override `_write_batch` instead.
        """
        return self._write_batch(stream_name, batch_id, record_batch)

    @overrides
    def _cleanup_batch(
        self,
        stream_name: str,
        batch_id: str,
        batch_handle: BatchHandle,
    ) -> None:
        """Clean up the cache.

        For file writers, this means deleting the files created and declared in the batch.
        This method is a no-op if the `cleanup` config option is set to False.
        Files that no longer exist are skipped, so repeated cleanup does not raise.
        """
        # Not needed for file deletion; referenced only to mark them as used.
        _ = stream_name, batch_id

        if not self.config.cleanup:
            return

        batch_handle = cast(FileWriterBatchHandle, batch_handle)
        for file_path in batch_handle.files:
            # `missing_ok=True` keeps cleanup idempotent if a file is already gone.
            file_path.unlink(missing_ok=True)

    @final
    def cleanup_batch(
        self,
        stream_name: str,
        batch_id: str,
        batch_handle: BatchHandle,
    ) -> None:
        """Clean up the cache.

        For file writers, this means deleting the files created and declared in the batch.
        This method is final because it should not be overridden.
        Subclasses should override `_cleanup_batch` instead.
        """
        self._cleanup_batch(stream_name, batch_id, batch_handle)
Oops, something went wrong.