From 070937a7c2565532fb61de29abc90d4650757ecd Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Micha=C5=82=20Szczepanik?= Date: Tue, 21 Nov 2023 12:35:50 +0100 Subject: [PATCH] Add an encoding argument to tabby loader When an encoding is explicitly specified, it will be used. Otherwise, the default encoding used by Path.open will be tried, and charset_normalizer will be used to guess the encoding if that fails. --- datalad_tabby/io/load.py | 41 +++++++++++++++++++++++++--------------- 1 file changed, 26 insertions(+), 15 deletions(-) diff --git a/datalad_tabby/io/load.py b/datalad_tabby/io/load.py index dc43565..94ae273 100644 --- a/datalad_tabby/io/load.py +++ b/datalad_tabby/io/load.py @@ -75,10 +75,11 @@ def __init__( self._jsonld = jsonld self._recursive = recursive - def __call__(self, src: Path, *, single: bool = True): + def __call__(self, src: Path, *, single: bool = True, encoding: str | None = None): return (self._load_single if single else self._load_many)( src=src, trace=[], + encoding=encoding, ) def _load_single( @@ -86,6 +87,7 @@ def _load_single( *, src: Path, trace: List, + encoding: str | None = None, ) -> Dict: jfpath = self._get_corresponding_jsondata_fpath(src) obj = json.load(jfpath.open()) if jfpath.exists() else {} @@ -97,13 +99,16 @@ def _load_single( trace=trace, ) - try: - tsv_obj = self._parse_tsv_single(src) - except UnicodeDecodeError: - # by default Path.open() uses locale.getencoding() - # that didn't work, try guessing - encoding = cs_from_path(src).best().encoding + if encoding is not None: tsv_obj = self._parse_tsv_single(src, encoding=encoding) + else: + try: + tsv_obj = self._parse_tsv_single(src) + except UnicodeDecodeError: + # by default Path.open() uses locale.getencoding() + # that didn't work, try guessing + encoding = cs_from_path(src).best().encoding + tsv_obj = self._parse_tsv_single(src, encoding=encoding) obj.update(tsv_obj) @@ -140,6 +145,7 @@ def _load_many( *, src: Path, trace: List, + encoding: str | None = None, ) -> List[Dict]: obj_tmpl 
= {} array = list() @@ -160,17 +166,22 @@ def _load_many( # the table field/column names have purposefully _nothing_ # to do with any possibly loaded JSON data - try: - tsv_array = self._parse_tsv_many( - src, obj_tmpl, trace=trace, fieldnames=None - ) - except UnicodeDecodeError: - # by default Path.open() uses locale.getencoding() - # that didn't work, try guessing - encoding = cs_from_path(src).best().encoding + if encoding is not None: tsv_array = self._parse_tsv_many( src, obj_tmpl, trace=trace, fieldnames=None, encoding=encoding ) + else: + try: + tsv_array = self._parse_tsv_many( + src, obj_tmpl, trace=trace, fieldnames=None + ) + except UnicodeDecodeError: + # by default Path.open() uses locale.getencoding() + # that didn't work, try guessing + encoding = cs_from_path(src).best().encoding + tsv_array = self._parse_tsv_many( + src, obj_tmpl, trace=trace, fieldnames=None, encoding=encoding + ) array.extend(tsv_array)