Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[file-based cdk] Remove CSV quoting_behavior config option #29388

Merged
merged 5 commits into from
Aug 15, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -10,13 +10,6 @@
from typing_extensions import Literal


class QuotingBehavior(Enum):
    """Options describing when values in a CSV row are expected to be quoted.

    Each member's value is the human-readable label used in the stream
    configuration (for example ``"Quote All"``), so a configured string can
    be turned back into a member with ``QuotingBehavior("Quote All")``.
    """

    # Every row value is expected to be wrapped in quotes.
    QUOTE_ALL = "Quote All"
    # Used as the default for the `quoting_behavior` config field.
    QUOTE_SPECIAL_CHARACTERS = "Quote Special Characters"
    # Quotes are expected around row values that do not contain numbers.
    QUOTE_NONNUMERIC = "Quote Non-numeric"
    # NOTE(review): presumably no value is expected to be quoted — confirm
    # against the csv.QUOTE_NONE semantics used by the parser.
    QUOTE_NONE = "Quote None"


class InferenceType(Enum):
NONE = "None"
PRIMITIVE_TYPES_ONLY = "Primitive Types Only"
Expand Down Expand Up @@ -53,11 +46,6 @@ class Config:
double_quote: bool = Field(
title="Double Quote", default=True, description="Whether two quotes in a quoted CSV value denote a single quote in the data."
)
quoting_behavior: QuotingBehavior = Field(
title="Quoting Behavior",
default=QuotingBehavior.QUOTE_SPECIAL_CHARACTERS,
description="The quoting behavior determines when a value in a row should have quote marks added around it. For example, if Quote Non-numeric is specified, while reading, quotes are expected for row values that do not contain numbers. Or for Quote All, every row value will be expecting quotes.",
)
null_values: Set[str] = Field(
title="Null Values",
default=[],
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@
from io import IOBase
from typing import Any, Callable, Dict, Generator, Iterable, List, Mapping, Optional, Set

from airbyte_cdk.sources.file_based.config.csv_format import CsvFormat, InferenceType, QuotingBehavior
from airbyte_cdk.sources.file_based.config.csv_format import CsvFormat, InferenceType
from airbyte_cdk.sources.file_based.config.file_based_stream_config import FileBasedStreamConfig
from airbyte_cdk.sources.file_based.exceptions import FileBasedSourceError, RecordParseError
from airbyte_cdk.sources.file_based.file_based_stream_reader import AbstractFileBasedStreamReader, FileReadMode
Expand All @@ -21,13 +21,6 @@

DIALECT_NAME = "_config_dialect"

# Lookup table from each user-facing QuotingBehavior option to the matching
# `quoting` constant of the standard-library csv module.
config_to_quoting: Mapping[QuotingBehavior, int] = {
    QuotingBehavior.QUOTE_ALL: csv.QUOTE_ALL,
    # "Quote Special Characters" is expressed as csv's minimal quoting mode.
    QuotingBehavior.QUOTE_SPECIAL_CHARACTERS: csv.QUOTE_MINIMAL,
    QuotingBehavior.QUOTE_NONNUMERIC: csv.QUOTE_NONNUMERIC,
    QuotingBehavior.QUOTE_NONE: csv.QUOTE_NONE,
}


class _CsvReader:
def read_data(
Expand All @@ -50,7 +43,7 @@ def read_data(
quotechar=config_format.quote_char,
escapechar=config_format.escape_char,
doublequote=config_format.double_quote,
quoting=config_to_quoting.get(config_format.quoting_behavior, csv.QUOTE_MINIMAL),
quoting=csv.QUOTE_MINIMAL,
)
with stream_reader.open_file(file, file_read_mode, config_format.encoding, logger) as fp:
headers = self._get_headers(fp, config_format, dialect_name)
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -5,25 +5,23 @@
from typing import Any, Mapping, Type

import pytest as pytest
from airbyte_cdk.sources.file_based.config.csv_format import QuotingBehavior
from airbyte_cdk.sources.file_based.config.file_based_stream_config import CsvFormat, FileBasedStreamConfig
from pydantic import ValidationError


@pytest.mark.parametrize(
"file_type, input_format, expected_format, expected_error",
[
pytest.param("csv", {"filetype": "csv", "delimiter": "d", "quote_char": "q", "escape_char": "e", "encoding": "ascii", "double_quote": True, "quoting_behavior": "Quote All"}, {"filetype": "csv", "delimiter": "d", "quote_char": "q", "escape_char": "e", "encoding": "ascii", "double_quote": True, "quoting_behavior": QuotingBehavior.QUOTE_ALL}, None, id="test_valid_format"),
pytest.param("csv", {"filetype": "csv", "double_quote": False}, {"delimiter": ",", "quote_char": "\"", "encoding": "utf8", "double_quote": False, "quoting_behavior": QuotingBehavior.QUOTE_SPECIAL_CHARACTERS}, None, id="test_default_format_values"),
pytest.param("csv", {"filetype": "csv", "delimiter": "d", "quote_char": "q", "escape_char": "e", "encoding": "ascii", "double_quote": True}, {"filetype": "csv", "delimiter": "d", "quote_char": "q", "escape_char": "e", "encoding": "ascii", "double_quote": True}, None, id="test_valid_format"),
pytest.param("csv", {"filetype": "csv", "double_quote": False}, {"delimiter": ",", "quote_char": "\"", "encoding": "utf8", "double_quote": False}, None, id="test_default_format_values"),
pytest.param("csv", {"filetype": "csv", "delimiter": "nope", "double_quote": True}, None, ValidationError, id="test_invalid_delimiter"),
pytest.param("csv", {"filetype": "csv", "quote_char": "nope", "double_quote": True}, None, ValidationError, id="test_invalid_quote_char"),
pytest.param("csv", {"filetype": "csv", "escape_char": "nope", "double_quote": True}, None, ValidationError, id="test_invalid_escape_char"),
pytest.param("csv", {"filetype": "csv", "delimiter": ",", "quote_char": "\"", "encoding": "not_a_format", "double_quote": True}, {}, ValidationError, id="test_invalid_encoding_type"),
pytest.param("csv", {"filetype": "csv", "double_quote": True, "quoting_behavior": "Quote Invalid"}, None, ValidationError, id="test_invalid_quoting_behavior"),
pytest.param("invalid", {"filetype": "invalid", "double_quote": False}, {}, ValidationError, id="test_config_format_file_type_mismatch"),
]
)
def test_csv_config(file_type: str, input_format: Mapping[str, Any], expected_format: Mapping[str, QuotingBehavior], expected_error: Type[Exception]) -> None:
def test_csv_config(file_type: str, input_format: Mapping[str, Any], expected_format: Mapping[str, Any], expected_error: Type[Exception]) -> None:
stream_config = {
"name": "stream1",
"file_type": file_type,
Expand Down Expand Up @@ -58,7 +56,6 @@ def test_invalid_validation_policy() -> None:
"escape_char": "e",
"encoding": "ascii",
"double_quote": True,
"quoting_behavior": "Quote All"
},
}
with pytest.raises(ValidationError):
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -148,12 +148,6 @@
"default": True,
"type": "boolean",
},
"quoting_behavior": {
"title": "Quoting Behavior",
"description": "The quoting behavior determines when a value in a row should have quote marks added around it. For example, if Quote Non-numeric is specified, while reading, quotes are expected for row values that do not contain numbers. Or for Quote All, every row value will be expecting quotes.",
"default": "Quote Special Characters",
"enum": ["Quote All", "Quote Special Characters", "Quote Non-numeric", "Quote None"],
},
"null_values": {
"title": "Null Values",
"description": "A set of case-sensitive strings that should be interpreted as null values. For example, if the value 'NA' should be interpreted as null, enter 'NA' in this field.",
Expand Down Expand Up @@ -781,7 +775,6 @@
"quote_char": "|",
"escape_char": "!",
"double_quote": True,
"quoting_behavior": "Quote Special Characters",
},
}
]
Expand Down Expand Up @@ -896,7 +889,6 @@
"escape_char": "@",
"double_quote": True,
"newlines_in_values": False,
"quoting_behavior": "Quote All",
},
},
]
Expand Down Expand Up @@ -1733,7 +1725,9 @@
"file_type": "csv",
"globs": ["*"],
"validation_policy": "Emit Record",
"format": {"filetype": "csv", "quoting_behavior": "Quote All"},
"format": {
"filetype": "csv",
}
}
],
"start_date": "2023-06-04T03:54:07.000000Z",
Expand Down Expand Up @@ -1883,8 +1877,7 @@
"quote_char": '"',
"delimiter": ",",
"escape_char": "\\",
"quoting_behavior": "Quote All",
},
}
}
],
"start_date": "2023-06-04T03:54:07.000000Z",
Expand Down Expand Up @@ -1955,8 +1948,7 @@
"double_quotes": True,
"quote_char": '"',
"delimiter": ",",
"quoting_behavior": "Quote All",
},
}
}
],
"start_date": "2023-06-04T03:54:07.000000Z",
Expand Down