From 85ed413f1d4f29dabaa40de949964c461d29d9ae Mon Sep 17 00:00:00 2001 From: ohadmata Date: Thu, 25 Jan 2024 15:17:57 +0200 Subject: [PATCH 1/2] support csv with single column --- examples/read_csv.py | 2 +- examples/utils.py | 3 +- src/shmessy/__init__.py | 42 +++++++++++++------------- src/shmessy/exceptions.py | 11 ------- tests/data/data_7.csv | 9 ++++++ tests/intg/test_read_csv.py | 59 +++++++++++++++++++++---------------- 6 files changed, 66 insertions(+), 60 deletions(-) create mode 100644 tests/data/data_7.csv diff --git a/examples/read_csv.py b/examples/read_csv.py index 059d1c9..7faf7c7 100644 --- a/examples/read_csv.py +++ b/examples/read_csv.py @@ -5,6 +5,6 @@ if __name__ == "__main__": init_logger() shmessy = Shmessy() - df = shmessy.read_csv('../tests/data/data_4.csv') + df = shmessy.read_csv('../tests/data/data_7.csv') inferred_schema = shmessy.get_inferred_schema() pretty_print_df(df=df, inferred_schema=inferred_schema) diff --git a/examples/utils.py b/examples/utils.py index 24d511b..b758d4a 100644 --- a/examples/utils.py +++ b/examples/utils.py @@ -27,11 +27,10 @@ def pretty_print_df( df: DataFrame, *, sample_size: Optional[int] = 10, - sort_key: Optional[str] = "id", inferred_schema: Optional[ShmessySchema] = None ) -> None: df = df[:sample_size] - df = df.sort_values(by=[sort_key]) + df = df.sort_values(df.columns[0]) df = df.rename(columns=add_data_types_to_column_names(df, inferred_schema)) print(tabulate(df, headers="keys", tablefmt="rounded_outline", showindex=False)) diff --git a/src/shmessy/__init__.py b/src/shmessy/__init__.py index 8e70b0d..dd3c2d6 100644 --- a/src/shmessy/__init__.py +++ b/src/shmessy/__init__.py @@ -92,27 +92,29 @@ def read_csv( fix_column_names: Optional[bool] = False, ) -> DataFrame: try: + dialect = None + if use_sniffer: - dialect = csv.Sniffer().sniff( - sample=_get_sample_from_csv( - filepath_or_buffer=filepath_or_buffer, - sample_size=self.__sample_size, - encoding=self.__reader_encoding, - ), - delimiters="".join([",", "\t", ";", " ", ":"]), - ) - df = pd.read_csv( - filepath_or_buffer=filepath_or_buffer, - dialect=dialect(), - low_memory=False, - encoding=self.__reader_encoding, - ) - else: - df = pd.read_csv( - filepath_or_buffer=filepath_or_buffer, - low_memory=False, - encoding=self.__reader_encoding, - ) + try: + dialect = csv.Sniffer().sniff( + sample=_get_sample_from_csv( + filepath_or_buffer=filepath_or_buffer, + sample_size=self.__sample_size, + encoding=self.__reader_encoding, + ), + delimiters="".join([",", "\t", ";", " ", ":"]), + ) + except Exception as e: # noqa + logger.debug( + f"Could not use python sniffer to infer csv schema, Using pandas default settings: {e}" + ) + + df = pd.read_csv( + filepath_or_buffer=filepath_or_buffer, + dialect=dialect() if dialect else None, + low_memory=False, + encoding=self.__reader_encoding, + ) if fixed_schema is None: fixed_schema = self.infer_schema(df) diff --git a/src/shmessy/exceptions.py b/src/shmessy/exceptions.py index dc6f80b..a63cc19 100644 --- a/src/shmessy/exceptions.py +++ b/src/shmessy/exceptions.py @@ -4,9 +4,6 @@ def exception_router(exception: Exception): error_message = str(exception) - if "Could not determine delimiter" in error_message: - raise CouldNotDetermineDelimiterException() - match = re.match( r"(.*)'(.*)' codec can't decode byte (.*) in position (.*):(.*)", error_message ) @@ -64,11 +61,3 @@ def __init__(self, expected_encoding: str): super().__init__( f"The given file cannot be read using {expected_encoding} encoding." ) - - -class CouldNotDetermineDelimiterException(ShmessyException): - def __init__(self): - super().__init__( - "Could not determine delimiter. " - "Make sure a delimiter is shown the same number of times on every row in the file." - ) diff --git a/tests/data/data_7.csv b/tests/data/data_7.csv new file mode 100644 index 0000000..8eff507 --- /dev/null +++ b/tests/data/data_7.csv @@ -0,0 +1,9 @@ +header_name +this +is +csv +file +with +single +column +only \ No newline at end of file diff --git a/tests/intg/test_read_csv.py b/tests/intg/test_read_csv.py index 731d9cf..1a306b3 100644 --- a/tests/intg/test_read_csv.py +++ b/tests/intg/test_read_csv.py @@ -1,4 +1,4 @@ -from numpy import dtypes +import numpy as np from shmessy import Shmessy @@ -6,33 +6,33 @@ def test_read_csv(files_folder): df = Shmessy().read_csv(files_folder.as_posix() + "/data_1.csv") - assert isinstance(df["created_at"].dtype, dtypes.DateTime64DType) - assert isinstance(df["modified_at"].dtype, dtypes.DateTime64DType) - assert isinstance(df["deleted_at"].dtype, dtypes.DateTime64DType) - assert isinstance(df["celebrated_at"].dtype, dtypes.DateTime64DType) - assert isinstance(df["joined_at"].dtype, dtypes.DateTime64DType) - assert isinstance(df["laughed_at"].dtype, dtypes.DateTime64DType) - assert isinstance(df["loled_at"].dtype, dtypes.DateTime64DType) - assert isinstance(df["fooled_at"].dtype, dtypes.DateTime64DType) - assert isinstance(df["emerged_at"].dtype, dtypes.DateTime64DType) - assert isinstance(df["processed_at"].dtype, dtypes.DateTime64DType) - assert isinstance(df["isolated_at"].dtype, dtypes.DateTime64DType) - assert isinstance(df["scheduled_at"].dtype, dtypes.DateTime64DType) - assert isinstance(df["unixed_at"].dtype, dtypes.DateTime64DType) + assert df["created_at"].dtype == np.dtype("datetime64[ns]") + assert df["modified_at"].dtype == np.dtype("datetime64[ns]") + assert df["deleted_at"].dtype == np.dtype("datetime64[ns]") + assert df["celebrated_at"].dtype == np.dtype("datetime64[ns]") + assert df["joined_at"].dtype == np.dtype("datetime64[ns]") + assert df["laughed_at"].dtype == np.dtype("datetime64[ns]") + assert df["loled_at"].dtype == np.dtype("datetime64[ns]") + assert df["fooled_at"].dtype == np.dtype("datetime64[ns]") + assert df["emerged_at"].dtype == np.dtype("datetime64[ns]") + assert df["processed_at"].dtype == np.dtype("datetime64[ns]") + assert df["isolated_at"].dtype == np.dtype("datetime64[ns]") + assert df["scheduled_at"].dtype == np.dtype("datetime64[ns]") + assert df["unixed_at"].dtype == np.dtype("datetime64[ns]") def test_read_csv_colon_as_delimiter(files_folder): df = Shmessy().read_csv(files_folder.as_posix() + "/data_3.csv") - assert isinstance(df["id"].dtype, dtypes.Int64DType) - assert isinstance(df["name"].dtype, dtypes.ObjectDType) - assert isinstance(df["value"].dtype, dtypes.Int64DType) + assert df["id"].dtype == np.dtype("int64") + assert df["name"].dtype == np.dtype("O") + assert df["value"].dtype == np.dtype("int64") def test_read_csv_semicolon_as_delimiter(files_folder): df = Shmessy().read_csv(files_folder.as_posix() + "/data_4.csv") - assert isinstance(df["id"].dtype, dtypes.Int64DType) - assert isinstance(df["name"].dtype, dtypes.ObjectDType) - assert isinstance(df["value"].dtype, dtypes.Int64DType) + assert df["id"].dtype == np.dtype("int64") + assert df["name"].dtype == np.dtype("O") + assert df["value"].dtype == np.dtype("int64") def test_buffer_as_read_csv_input(files_folder): @@ -40,9 +40,9 @@ def test_buffer_as_read_csv_input(files_folder): with open(path, mode="rt") as file_input: df = Shmessy().read_csv(file_input) - assert isinstance(df["id"].dtype, dtypes.Int64DType) - assert isinstance(df["name"].dtype, dtypes.ObjectDType) - assert isinstance(df["value"].dtype, dtypes.Int64DType) + assert df["id"].dtype == np.dtype("int64") + assert df["name"].dtype == np.dtype("O") + assert df["value"].dtype == np.dtype("int64") def test_binary_buffer_as_read_csv_input(files_folder): @@ -50,6 +50,13 @@ def test_binary_buffer_as_read_csv_input(files_folder): with open(path, mode="rb") as file_input: df = Shmessy().read_csv(file_input) - assert isinstance(df["id"].dtype, dtypes.Int64DType) - assert isinstance(df["name"].dtype, dtypes.ObjectDType) - assert isinstance(df["value"].dtype, dtypes.Int64DType) + assert df["id"].dtype == np.dtype("int64") + assert df["name"].dtype == np.dtype("O") + assert df["value"].dtype == np.dtype("int64") + + +def test_read_csv_file_with_single_column(files_folder): + path = files_folder.as_posix() + "/data_7.csv" + with open(path, mode="rb") as file_input: + df = Shmessy().read_csv(file_input) + assert df["header_name"].dtype == np.dtype("O") From bb9c6f2638c8cdf61b6f923afa98145d7736f6af Mon Sep 17 00:00:00 2001 From: ohadmata Date: Thu, 25 Jan 2024 15:19:02 +0200 Subject: [PATCH 2/2] support csv with single column --- examples/read_csv.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/read_csv.py b/examples/read_csv.py index 7faf7c7..059d1c9 100644 --- a/examples/read_csv.py +++ b/examples/read_csv.py @@ -5,6 +5,6 @@ if __name__ == "__main__": init_logger() shmessy = Shmessy() - df = shmessy.read_csv('../tests/data/data_7.csv') + df = shmessy.read_csv('../tests/data/data_4.csv') inferred_schema = shmessy.get_inferred_schema() pretty_print_df(df=df, inferred_schema=inferred_schema)