Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

CDK: Make consts required in Pydantic generated json schemas #32251

Merged
merged 4 commits into from
Nov 9, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@

from typing import List, Literal, Optional, Union

from airbyte_cdk.utils.oneof_option_config import OneOfOptionConfig
from pydantic import BaseModel, Field


Expand All @@ -16,11 +17,10 @@ class SeparatorSplitterConfigModel(BaseModel):
)
keep_separator: bool = Field(default=False, title="Keep separator", description="Whether to keep the separator in the resulting chunks")

class Config:
class Config(OneOfOptionConfig):
title = "By Separator"
schema_extra = {
"description": "Split the text by the list of separators until the chunk size is reached, using the earlier mentioned separators where possible. This is useful for splitting text fields by paragraphs, sentences, words, etc."
}
description = "Split the text by the list of separators until the chunk size is reached, using the earlier mentioned separators where possible. This is useful for splitting text fields by paragraphs, sentences, words, etc."
discriminator = "mode"


class MarkdownHeaderSplitterConfigModel(BaseModel):
Expand All @@ -33,11 +33,10 @@ class MarkdownHeaderSplitterConfigModel(BaseModel):
ge=1,
)

class Config:
class Config(OneOfOptionConfig):
title = "By Markdown header"
schema_extra = {
"description": "Split the text by Markdown headers down to the specified header level. If the chunk size fits multiple sections, they will be combined into a single chunk."
}
description = "Split the text by Markdown headers down to the specified header level. If the chunk size fits multiple sections, they will be combined into a single chunk."
discriminator = "mode"


class CodeSplitterConfigModel(BaseModel):
Expand Down Expand Up @@ -65,11 +64,12 @@ class CodeSplitterConfigModel(BaseModel):
],
)

class Config:
class Config(OneOfOptionConfig):
title = "By Programming Language"
schema_extra = {
"description": "Split the text by suitable delimiters based on the programming language. This is useful for splitting code into chunks."
}
description = (
"Split the text by suitable delimiters based on the programming language. This is useful for splitting code into chunks."
)
discriminator = "mode"


TextSplitterConfigModel = Union[SeparatorSplitterConfigModel, MarkdownHeaderSplitterConfigModel, CodeSplitterConfigModel]
Expand Down Expand Up @@ -128,11 +128,12 @@ class OpenAIEmbeddingConfigModel(BaseModel):
mode: Literal["openai"] = Field("openai", const=True)
openai_key: str = Field(..., title="OpenAI API key", airbyte_secret=True)

class Config:
class Config(OneOfOptionConfig):
title = "OpenAI"
schema_extra = {
"description": "Use the OpenAI API to embed text. This option is using the text-embedding-ada-002 model with 1536 embedding dimensions."
}
description = (
"Use the OpenAI API to embed text. This option is using the text-embedding-ada-002 model with 1536 embedding dimensions."
)
discriminator = "mode"


class OpenAICompatibleEmbeddingConfigModel(BaseModel):
Expand All @@ -151,9 +152,10 @@ class OpenAICompatibleEmbeddingConfigModel(BaseModel):
title="Embedding dimensions", description="The number of dimensions the embedding model is generating", examples=[1536, 384]
)

class Config:
class Config(OneOfOptionConfig):
title = "OpenAI-compatible"
schema_extra = {"description": "Use a service that's compatible with the OpenAI API to embed text."}
description = "Use a service that's compatible with the OpenAI API to embed text."
discriminator = "mode"


class AzureOpenAIEmbeddingConfigModel(BaseModel):
Expand All @@ -177,21 +179,19 @@ class AzureOpenAIEmbeddingConfigModel(BaseModel):
examples=["your-resource-name"],
)

class Config:
class Config(OneOfOptionConfig):
title = "Azure OpenAI"
schema_extra = {
"description": "Use the Azure-hosted OpenAI API to embed text. This option is using the text-embedding-ada-002 model with 1536 embedding dimensions."
}
description = "Use the Azure-hosted OpenAI API to embed text. This option is using the text-embedding-ada-002 model with 1536 embedding dimensions."
discriminator = "mode"


class FakeEmbeddingConfigModel(BaseModel):
mode: Literal["fake"] = Field("fake", const=True)

class Config:
class Config(OneOfOptionConfig):
title = "Fake"
schema_extra = {
"description": "Use a fake embedding made out of random vectors with 1536 embedding dimensions. This is useful for testing the data pipeline without incurring any costs."
}
description = "Use a fake embedding made out of random vectors with 1536 embedding dimensions. This is useful for testing the data pipeline without incurring any costs."
discriminator = "mode"


class FromFieldEmbeddingConfigModel(BaseModel):
Expand All @@ -203,17 +203,17 @@ class FromFieldEmbeddingConfigModel(BaseModel):
..., title="Embedding dimensions", description="The number of dimensions the embedding model is generating", examples=[1536, 384]
)

class Config:
class Config(OneOfOptionConfig):
title = "From Field"
schema_extra = {
"description": "Use a field in the record as the embedding. This is useful if you already have an embedding for your data and want to store it in the vector store."
}
description = "Use a field in the record as the embedding. This is useful if you already have an embedding for your data and want to store it in the vector store."
discriminator = "mode"


class CohereEmbeddingConfigModel(BaseModel):
mode: Literal["cohere"] = Field("cohere", const=True)
cohere_key: str = Field(..., title="Cohere API key", airbyte_secret=True)

class Config:
class Config(OneOfOptionConfig):
title = "Cohere"
schema_extra = {"description": "Use the Cohere API to embed text."}
description = "Use the Cohere API to embed text."
discriminator = "mode"
Original file line number Diff line number Diff line change
Expand Up @@ -2,12 +2,14 @@
# Copyright (c) 2023 Airbyte, Inc., all rights reserved.
#

from airbyte_cdk.utils.oneof_option_config import OneOfOptionConfig
from pydantic import BaseModel, Field


class AvroFormat(BaseModel):
class Config:
class Config(OneOfOptionConfig):
title = "Avro Format"
discriminator = "filetype"

filetype: str = Field(
"avro",
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@
from enum import Enum
from typing import Any, Dict, List, Optional, Set, Union

from airbyte_cdk.utils.oneof_option_config import OneOfOptionConfig
from pydantic import BaseModel, Field, ValidationError, root_validator, validator


Expand All @@ -21,8 +22,9 @@ class CsvHeaderDefinitionType(Enum):


class CsvHeaderFromCsv(BaseModel):
class Config:
class Config(OneOfOptionConfig):
title = "From CSV"
discriminator = "header_definition_type"

header_definition_type: str = Field(
CsvHeaderDefinitionType.FROM_CSV.value,
Expand All @@ -34,8 +36,9 @@ def has_header_row(self) -> bool:


class CsvHeaderAutogenerated(BaseModel):
class Config:
class Config(OneOfOptionConfig):
title = "Autogenerated"
discriminator = "header_definition_type"

header_definition_type: str = Field(
CsvHeaderDefinitionType.AUTOGENERATED.value,
Expand All @@ -47,8 +50,9 @@ def has_header_row(self) -> bool:


class CsvHeaderUserProvided(BaseModel):
class Config:
class Config(OneOfOptionConfig):
title = "User Provided"
discriminator = "header_definition_type"

header_definition_type: str = Field(
CsvHeaderDefinitionType.USER_PROVIDED.value,
Expand All @@ -74,8 +78,9 @@ def validate_column_names(cls, v: List[str]) -> List[str]:


class CsvFormat(BaseModel):
class Config:
class Config(OneOfOptionConfig):
title = "CSV Format"
discriminator = "filetype"

filetype: str = Field(
"csv",
Expand Down Expand Up @@ -123,7 +128,7 @@ class Config:
)
header_definition: Union[CsvHeaderFromCsv, CsvHeaderAutogenerated, CsvHeaderUserProvided] = Field(
title="CSV Header Definition",
default=CsvHeaderFromCsv(),
default=CsvHeaderFromCsv(header_definition_type=CsvHeaderDefinitionType.FROM_CSV.value),
description="How headers will be defined. `User Provided` assumes the CSV does not have a header row and uses the headers provided and `Autogenerated` assumes the CSV does not have a header row and the CDK will generate headers using for `f{i}` where `i` is the index starting from 0. Else, the default behavior is to use the header from the CSV file. If a user wants to autogenerate or provide column names for a CSV having headers, they can skip rows.",
)
true_values: Set[str] = Field(
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -2,12 +2,14 @@
# Copyright (c) 2023 Airbyte, Inc., all rights reserved.
#

from airbyte_cdk.utils.oneof_option_config import OneOfOptionConfig
from pydantic import BaseModel, Field


class JsonlFormat(BaseModel):
class Config:
class Config(OneOfOptionConfig):
title = "Jsonl Format"
discriminator = "filetype"

filetype: str = Field(
"jsonl",
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -2,12 +2,14 @@
# Copyright (c) 2023 Airbyte, Inc., all rights reserved.
#

from airbyte_cdk.utils.oneof_option_config import OneOfOptionConfig
from pydantic import BaseModel, Field


class ParquetFormat(BaseModel):
class Config:
class Config(OneOfOptionConfig):
title = "Parquet Format"
discriminator = "filetype"

filetype: str = Field(
"parquet",
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -2,13 +2,15 @@
# Copyright (c) 2023 Airbyte, Inc., all rights reserved.
#

from airbyte_cdk.utils.oneof_option_config import OneOfOptionConfig
from pydantic import BaseModel, Field


class UnstructuredFormat(BaseModel):
class Config:
class Config(OneOfOptionConfig):
title = "Document File Type Format (Experimental)"
schema_extra = {"description": "Extract text from document formats (.pdf, .docx, .md, .pptx) and emit as one record per file."}
description = "Extract text from document formats (.pdf, .docx, .md, .pptx) and emit as one record per file."
discriminator = "filetype"

filetype: str = Field(
"unstructured",
Expand Down
33 changes: 33 additions & 0 deletions airbyte-cdk/python/airbyte_cdk/utils/oneof_option_config.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,33 @@
#
# Copyright (c) 2023 Airbyte, Inc., all rights reserved.
#

from typing import Any, Dict


class OneOfOptionConfig:
    """
    Base class to configure a Pydantic model that's used as a oneOf option in a parent model in a way that's compatible with all Airbyte consumers.

    Inherit from this class in the nested Config class in a model and set title and description (these show up in the UI) and discriminator (this is making sure it's marked as required in the schema).

    Usage:

    ```python
    class OptionModel(BaseModel):
        mode: Literal["option_a"] = Field("option_a", const=True)
        option_a_field: str = Field(...)

        class Config(OneOfOptionConfig):
            title = "Option A"
            description = "Option A description"
            discriminator = "mode"
    ```
    """

    @staticmethod
    def schema_extra(schema: Dict[str, Any], model: Any) -> None:
        """Pydantic hook that post-processes the generated JSON schema for ``model``.

        Copies the Config-level ``description`` (if set) into the schema, and
        ensures the ``discriminator`` field appears in the schema's ``required``
        list — const fields with defaults are otherwise not marked required by
        Pydantic, but Airbyte consumers need them to be.

        :param schema: the JSON schema dict generated by Pydantic, mutated in place
        :param model: the Pydantic model class whose nested ``Config`` is inspected
        """
        if hasattr(model.Config, "description"):
            schema["description"] = model.Config.description
        if hasattr(model.Config, "discriminator"):
            # Guard against duplicate entries: the discriminator may already be
            # required (e.g. if the field was declared without a default).
            required = schema.setdefault("required", [])
            if model.Config.discriminator not in required:
                required.append(model.Config.discriminator)
Loading
Loading