From 3fc53cc279ba5ab74e04cee9af693c05d9995fcd Mon Sep 17 00:00:00 2001 From: Paul Hummer Date: Thu, 20 Apr 2023 13:35:46 -0600 Subject: [PATCH] fix: remove cchardet as a dependency `cchardet` hasn't had a commit in almost two years (8 days shy as of this commit). [This issue](https://github.com/PyYoshi/cChardet/issues/81) is a six-month-old bug showing that `cchardet` doesn't support Python 3.11, which is the most recent Python release. `cchardet` is fast, certainly, but the usage here is not performance-sensitive enough to warrant its use over `chardet`; a difference of 800ms is probably not worth keeping a problematic dependency around. --- requirements_dev.txt | 1 - setup.cfg | 1 - src/watchful/client.py | 18 +++++------------- 3 files changed, 5 insertions(+), 15 deletions(-) diff --git a/requirements_dev.txt b/requirements_dev.txt index 2b029a2..a0d8643 100644 --- a/requirements_dev.txt +++ b/requirements_dev.txt @@ -5,7 +5,6 @@ pyflakes==2.4.0 pylama==8.3.8 pylint==2.17.2 twine==4.0.1 -cchardet==2.1.7 chardet==5.1.0 flair==0.12.2 psutil==5.9.2 diff --git a/setup.cfg b/setup.cfg index 49a769a..37da639 100644 --- a/setup.cfg +++ b/setup.cfg @@ -22,7 +22,6 @@ license_files = package_dir = = src install_requires = - cchardet>=2.1.7 chardet>=5.1.0 flair>=0.11.3 psutil>=5.9.2 diff --git a/src/watchful/client.py b/src/watchful/client.py index 024c86f..54aa801 100644 --- a/src/watchful/client.py +++ b/src/watchful/client.py @@ -17,6 +17,8 @@ import urllib from typing import Callable, Dict, Generator, List, Literal, Optional, Union from uuid import uuid4 + +import chardet import requests @@ -1172,7 +1174,6 @@ def is_utf8( csv_bytes: bytes = None, filepath: str = None, threshold: float = 0.5, - is_fast: bool = True, ) -> bool: """ This function attempts to detect if the encoding of the given bytes or the @@ -1191,7 +1192,6 @@ def is_utf8( :type threshold: float, optional :param is_fast: Whether to use fast encoding detection with a lower accuracy, 
or not. - :type is_fast: bool, optional :return: `True` if the detected encoding is utf-8 and has a confidence of the given threshold or more, otherwise `False`. :rtype: bool @@ -1207,11 +1207,6 @@ def is_utf8( "Only one of them needs to be specified." ) - if is_fast: - import cchardet as chardet - else: - import chardet - if csv_bytes: res = chardet.detect(csv_bytes) else: @@ -1239,7 +1234,7 @@ def create_dataset( filename: str = "none", has_header: bool = True, threshold_detect: float = 0.5, - is_fast_detect: bool = True, + is_fast_detect: bool = True, # pylint: disable=W0613 force_load: bool = True, ) -> str: """ @@ -1259,8 +1254,7 @@ def create_dataset( :param threshold_detect: The minimum confidence required to accept the detected encoding. :type threshold_detect: float, optional - :param is_fast_detect: Whether to use fast encoding detection with a lower - accuracy, or not. + :param is_fast_detect: No longer used, but remains for API compatibility :type is_fast_detect: bool, optional :param force_load: The boolean indicating if the csv dataset will be loaded even when its encoding is detected to be non-utf-8, defaults to True. @@ -1274,9 +1268,7 @@ def create_dataset( TODO: Add error handling. """ - is_csv_bytes_utf8 = is_utf8( - csv_bytes, None, threshold_detect, is_fast_detect - ) + is_csv_bytes_utf8 = is_utf8(csv_bytes, None, threshold_detect) if is_csv_bytes_utf8 or force_load: id_ = str(uuid4())