Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

CDK: Make consts required in Pydantic generated json schemas #32251

Merged
merged 4 commits into from
Nov 9, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@

from typing import List, Literal, Optional, Union

from airbyte_cdk.utils.oneof_option_config import OneOfOptionConfig
from pydantic import BaseModel, Field


Expand All @@ -16,11 +17,10 @@ class SeparatorSplitterConfigModel(BaseModel):
)
keep_separator: bool = Field(default=False, title="Keep separator", description="Whether to keep the separator in the resulting chunks")

class Config:
class Config(OneOfOptionConfig):
title = "By Separator"
schema_extra = {
"description": "Split the text by the list of separators until the chunk size is reached, using the earlier mentioned separators where possible. This is useful for splitting text fields by paragraphs, sentences, words, etc."
}
description = "Split the text by the list of separators until the chunk size is reached, using the earlier mentioned separators where possible. This is useful for splitting text fields by paragraphs, sentences, words, etc."
discriminator = "mode"


class MarkdownHeaderSplitterConfigModel(BaseModel):
Expand All @@ -33,11 +33,10 @@ class MarkdownHeaderSplitterConfigModel(BaseModel):
ge=1,
)

class Config:
class Config(OneOfOptionConfig):
title = "By Markdown header"
schema_extra = {
"description": "Split the text by Markdown headers down to the specified header level. If the chunk size fits multiple sections, they will be combined into a single chunk."
}
description = "Split the text by Markdown headers down to the specified header level. If the chunk size fits multiple sections, they will be combined into a single chunk."
discriminator = "mode"


class CodeSplitterConfigModel(BaseModel):
Expand Down Expand Up @@ -65,11 +64,12 @@ class CodeSplitterConfigModel(BaseModel):
],
)

class Config:
class Config(OneOfOptionConfig):
title = "By Programming Language"
schema_extra = {
"description": "Split the text by suitable delimiters based on the programming language. This is useful for splitting code into chunks."
}
description = (
"Split the text by suitable delimiters based on the programming language. This is useful for splitting code into chunks."
)
discriminator = "mode"


TextSplitterConfigModel = Union[SeparatorSplitterConfigModel, MarkdownHeaderSplitterConfigModel, CodeSplitterConfigModel]
Expand Down Expand Up @@ -128,11 +128,12 @@ class OpenAIEmbeddingConfigModel(BaseModel):
mode: Literal["openai"] = Field("openai", const=True)
openai_key: str = Field(..., title="OpenAI API key", airbyte_secret=True)

class Config:
class Config(OneOfOptionConfig):
title = "OpenAI"
schema_extra = {
"description": "Use the OpenAI API to embed text. This option is using the text-embedding-ada-002 model with 1536 embedding dimensions."
}
description = (
"Use the OpenAI API to embed text. This option is using the text-embedding-ada-002 model with 1536 embedding dimensions."
)
discriminator = "mode"


class OpenAICompatibleEmbeddingConfigModel(BaseModel):
Expand All @@ -151,9 +152,10 @@ class OpenAICompatibleEmbeddingConfigModel(BaseModel):
title="Embedding dimensions", description="The number of dimensions the embedding model is generating", examples=[1536, 384]
)

class Config:
class Config(OneOfOptionConfig):
title = "OpenAI-compatible"
schema_extra = {"description": "Use a service that's compatible with the OpenAI API to embed text."}
description = "Use a service that's compatible with the OpenAI API to embed text."
discriminator = "mode"


class AzureOpenAIEmbeddingConfigModel(BaseModel):
Expand All @@ -177,21 +179,19 @@ class AzureOpenAIEmbeddingConfigModel(BaseModel):
examples=["your-resource-name"],
)

class Config:
class Config(OneOfOptionConfig):
title = "Azure OpenAI"
schema_extra = {
"description": "Use the Azure-hosted OpenAI API to embed text. This option is using the text-embedding-ada-002 model with 1536 embedding dimensions."
}
description = "Use the Azure-hosted OpenAI API to embed text. This option is using the text-embedding-ada-002 model with 1536 embedding dimensions."
discriminator = "mode"


class FakeEmbeddingConfigModel(BaseModel):
mode: Literal["fake"] = Field("fake", const=True)

class Config:
class Config(OneOfOptionConfig):
title = "Fake"
schema_extra = {
"description": "Use a fake embedding made out of random vectors with 1536 embedding dimensions. This is useful for testing the data pipeline without incurring any costs."
}
description = "Use a fake embedding made out of random vectors with 1536 embedding dimensions. This is useful for testing the data pipeline without incurring any costs."
discriminator = "mode"


class FromFieldEmbeddingConfigModel(BaseModel):
Expand All @@ -203,17 +203,17 @@ class FromFieldEmbeddingConfigModel(BaseModel):
..., title="Embedding dimensions", description="The number of dimensions the embedding model is generating", examples=[1536, 384]
)

class Config:
class Config(OneOfOptionConfig):
title = "From Field"
schema_extra = {
"description": "Use a field in the record as the embedding. This is useful if you already have an embedding for your data and want to store it in the vector store."
}
description = "Use a field in the record as the embedding. This is useful if you already have an embedding for your data and want to store it in the vector store."
discriminator = "mode"


class CohereEmbeddingConfigModel(BaseModel):
mode: Literal["cohere"] = Field("cohere", const=True)
cohere_key: str = Field(..., title="Cohere API key", airbyte_secret=True)

class Config:
class Config(OneOfOptionConfig):
title = "Cohere"
schema_extra = {"description": "Use the Cohere API to embed text."}
description = "Use the Cohere API to embed text."
discriminator = "mode"
Original file line number Diff line number Diff line change
Expand Up @@ -2,12 +2,14 @@
# Copyright (c) 2023 Airbyte, Inc., all rights reserved.
#

from airbyte_cdk.utils.oneof_option_config import OneOfOptionConfig
from pydantic import BaseModel, Field


class AvroFormat(BaseModel):
class Config:
class Config(OneOfOptionConfig):
title = "Avro Format"
discriminator = "filetype"

filetype: str = Field(
"avro",
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@
from enum import Enum
from typing import Any, Dict, List, Optional, Set, Union

from airbyte_cdk.utils.oneof_option_config import OneOfOptionConfig
from pydantic import BaseModel, Field, ValidationError, root_validator, validator


Expand All @@ -21,8 +22,9 @@ class CsvHeaderDefinitionType(Enum):


class CsvHeaderFromCsv(BaseModel):
class Config:
class Config(OneOfOptionConfig):
title = "From CSV"
discriminator = "header_definition_type"

header_definition_type: str = Field(
CsvHeaderDefinitionType.FROM_CSV.value,
Expand All @@ -34,8 +36,9 @@ def has_header_row(self) -> bool:


class CsvHeaderAutogenerated(BaseModel):
class Config:
class Config(OneOfOptionConfig):
title = "Autogenerated"
discriminator = "header_definition_type"

header_definition_type: str = Field(
CsvHeaderDefinitionType.AUTOGENERATED.value,
Expand All @@ -47,8 +50,9 @@ def has_header_row(self) -> bool:


class CsvHeaderUserProvided(BaseModel):
class Config:
class Config(OneOfOptionConfig):
title = "User Provided"
discriminator = "header_definition_type"

header_definition_type: str = Field(
CsvHeaderDefinitionType.USER_PROVIDED.value,
Expand All @@ -74,8 +78,9 @@ def validate_column_names(cls, v: List[str]) -> List[str]:


class CsvFormat(BaseModel):
class Config:
class Config(OneOfOptionConfig):
title = "CSV Format"
discriminator = "filetype"

filetype: str = Field(
"csv",
Expand Down Expand Up @@ -123,7 +128,7 @@ class Config:
)
header_definition: Union[CsvHeaderFromCsv, CsvHeaderAutogenerated, CsvHeaderUserProvided] = Field(
title="CSV Header Definition",
default=CsvHeaderFromCsv(),
default=CsvHeaderFromCsv(header_definition_type=CsvHeaderDefinitionType.FROM_CSV.value),
description="How headers will be defined. `User Provided` assumes the CSV does not have a header row and uses the headers provided and `Autogenerated` assumes the CSV does not have a header row and the CDK will generate headers using for `f{i}` where `i` is the index starting from 0. Else, the default behavior is to use the header from the CSV file. If a user wants to autogenerate or provide column names for a CSV having headers, they can skip rows.",
)
true_values: Set[str] = Field(
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -2,12 +2,14 @@
# Copyright (c) 2023 Airbyte, Inc., all rights reserved.
#

from airbyte_cdk.utils.oneof_option_config import OneOfOptionConfig
from pydantic import BaseModel, Field


class JsonlFormat(BaseModel):
class Config:
class Config(OneOfOptionConfig):
title = "Jsonl Format"
discriminator = "filetype"

filetype: str = Field(
"jsonl",
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -2,12 +2,14 @@
# Copyright (c) 2023 Airbyte, Inc., all rights reserved.
#

from airbyte_cdk.utils.oneof_option_config import OneOfOptionConfig
from pydantic import BaseModel, Field


class ParquetFormat(BaseModel):
class Config:
class Config(OneOfOptionConfig):
title = "Parquet Format"
discriminator = "filetype"

filetype: str = Field(
"parquet",
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -2,13 +2,15 @@
# Copyright (c) 2023 Airbyte, Inc., all rights reserved.
#

from airbyte_cdk.utils.oneof_option_config import OneOfOptionConfig
from pydantic import BaseModel, Field


class UnstructuredFormat(BaseModel):
class Config:
class Config(OneOfOptionConfig):
title = "Document File Type Format (Experimental)"
schema_extra = {"description": "Extract text from document formats (.pdf, .docx, .md, .pptx) and emit as one record per file."}
description = "Extract text from document formats (.pdf, .docx, .md, .pptx) and emit as one record per file."
discriminator = "filetype"

filetype: str = Field(
"unstructured",
Expand Down
33 changes: 33 additions & 0 deletions airbyte-cdk/python/airbyte_cdk/utils/oneof_option_config.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,33 @@
#
# Copyright (c) 2023 Airbyte, Inc., all rights reserved.
#

from typing import Any, Dict


class OneOfOptionConfig:
    """
    Base class to configure a Pydantic model that's used as a oneOf option in a parent model in a way that's compatible with all Airbyte consumers.

    Inherit from this class in the nested Config class in a model and set title and description (these show up in the UI) and discriminator (this is making sure it's marked as required in the schema).

    Usage:

    ```python
    class OptionModel(BaseModel):
        mode: Literal["option_a"] = Field("option_a", const=True)
        option_a_field: str = Field(...)

        class Config(OneOfOptionConfig):
            title = "Option A"
            description = "Option A description"
            discriminator = "mode"
    ```
    """

    @staticmethod
    def schema_extra(schema: Dict[str, Any], model: Any) -> None:
        """Pydantic hook that post-processes the generated JSON schema for ``model``.

        Copies the Config-level ``description`` (if set) into the schema, and
        ensures the ``discriminator`` field appears in the schema's ``required``
        list — const fields with defaults are otherwise not marked required by
        Pydantic, but Airbyte consumers need them to be.

        :param schema: the JSON schema dict generated by Pydantic, mutated in place
        :param model: the Pydantic model class whose nested ``Config`` is inspected
        """
        if hasattr(model.Config, "description"):
            schema["description"] = model.Config.description
        if hasattr(model.Config, "discriminator"):
            # Guard against duplicate entries: the discriminator may already be
            # required (e.g. if the field was declared without a default).
            required = schema.setdefault("required", [])
            if model.Config.discriminator not in required:
                required.append(model.Config.discriminator)
Loading
Loading