From 99e4cb58b9847452e41dc1a6361862423fdab7e2 Mon Sep 17 00:00:00 2001 From: Michael Penkov Date: Mon, 9 Mar 2020 12:52:33 +1100 Subject: [PATCH 1/3] improve buffering efficiency --- smart_open/s3.py | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) diff --git a/smart_open/s3.py b/smart_open/s3.py index 96bce460..93d61ffe 100644 --- a/smart_open/s3.py +++ b/smart_open/s3.py @@ -314,10 +314,6 @@ def read(self, size=-1): if self._eof: return self._read_from_buffer() - # - # Fill our buffer to the required size. - # - # logger.debug('filling %r byte-long buffer up to %r bytes', len(self._buffer), size) self._fill_buffer(size) return self._read_from_buffer(size) @@ -430,7 +426,7 @@ def _read_from_buffer(self, size=-1): return part def _fill_buffer(self, size=-1): - size = size if size >= 0 else self._buffer._chunk_size + size = max(size, self._buffer._chunk_size) while len(self._buffer) < size and not self._eof: bytes_read = self._buffer.fill(self._raw_reader) if bytes_read == 0: From 3055c9d73b20eb40a38e7fde2584419aed929da5 Mon Sep 17 00:00:00 2001 From: Michael Penkov Date: Wed, 11 Mar 2020 13:43:01 +1100 Subject: [PATCH 2/3] add benchmarks Before: ------------------------------------------- benchmark: 1 tests ------------------------------------------- Name (time in s) Min Max Mean StdDev Median IQR Outliers OPS Rounds Iterations ---------------------------------------------------------------------------------------------------------- test 4.8925 10.1093 5.9906 2.3032 5.0104 1.3963 1;1 0.1669 5 1 ---------------------------------------------------------------------------------------------------------- After: ------------------------------------------- benchmark: 1 tests ------------------------------------------ Name (time in s) Min Max Mean StdDev Median IQR Outliers OPS Rounds Iterations --------------------------------------------------------------------------------------------------------- test 4.9611 9.7707 5.9822 2.1190 5.0280 1.3168 1;1 0.1672 5 1 --------------------------------------------------------------------------------------------------------- --- integration-tests/test_s3_buffering.py | 24 ++++++++++++++++++++++++ 1 file changed, 24 insertions(+) create mode 100644 integration-tests/test_s3_buffering.py diff --git a/integration-tests/test_s3_buffering.py b/integration-tests/test_s3_buffering.py new file mode 100644 index 00000000..1e17cf95 --- /dev/null +++ b/integration-tests/test_s3_buffering.py @@ -0,0 +1,24 @@ +import sys +from smart_open import open + + +def read_bytes(url, limit): + bytes_ = [] + with open(url, 'rb') as fin: + for i in range(limit): + bytes_.append(fin.read(1)) + + return bytes_ + + +def test(benchmark): + # + # This file is around 850MB. + # + url = ( + 's3://commoncrawl/crawl-data/CC-MAIN-2019-51/segments/1575541319511.97' + '/warc/CC-MAIN-20191216093448-20191216121448-00559.warc.gz' + ) + limit = 1000000 + bytes_ = benchmark(read_bytes, url, limit) + assert len(bytes_) == limit From b9e6684f309e0cb1b9c1bb11c6ebf12da585a1cf Mon Sep 17 00:00:00 2001 From: Michael Penkov Date: Wed, 11 Mar 2020 13:47:17 +1100 Subject: [PATCH 3/3] remove unused import --- integration-tests/test_s3_buffering.py | 1 - 1 file changed, 1 deletion(-) diff --git a/integration-tests/test_s3_buffering.py b/integration-tests/test_s3_buffering.py index 1e17cf95..6346c1ce 100644 --- a/integration-tests/test_s3_buffering.py +++ b/integration-tests/test_s3_buffering.py @@ -1,4 +1,3 @@ -import sys from smart_open import open