Skip to content

Commit

Permalink
ingestion: Use utf-8 if encoding detection fails
Browse files Browse the repository at this point in the history
  • Loading branch information
abhidg committed Jul 5, 2021
1 parent 4093c5f commit a179490
Showing 1 changed file with 6 additions and 1 deletion.
7 changes: 6 additions & 1 deletion ingestion/functions/retrieval/retrieval.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,7 @@
SOURCE_ID_FIELD = "sourceId"
PARSING_DATE_RANGE_FIELD = "parsingDateRange"
TIME_FILEPART_FORMAT = "/%Y/%m/%d/%H%M/"
+DEFAULT_ENCODING = 'utf-8'
READ_CHUNK_BYTES = 2048
HEADER_CHUNK_BYTES = 1024 * 1024
CSV_CHUNK_BYTES = 2 * 1024 * 1024
Expand Down Expand Up @@ -154,7 +155,11 @@ def retrieve_content(
# Read 2MB to be quite sure about the encoding.
detected_enc = detect(bytesio.read(2 << 20))
bytesio.seek(0)
-print(f'Source encoding is presumably {detected_enc}')
+if detected_enc['encoding']:
+    print(f'Source encoding is presumably {detected_enc}')
+else:
+    detected_enc['encoding'] = DEFAULT_ENCODING
+    print(f'Source encoding detection failed, setting to {DEFAULT_ENCODING}')
fd, outfile_name = tempfile.mkstemp(dir=tempdir)
with os.fdopen(fd, "w", encoding='utf-8') as outfile:
text_stream = codecs.getreader(detected_enc['encoding'])(bytesio)
Expand Down

0 comments on commit a179490

Please sign in to comment.