diff --git a/integration-tests/test_s3_buffering.py b/integration-tests/test_s3_buffering.py
new file mode 100644
index 00000000..6346c1ce
--- /dev/null
+++ b/integration-tests/test_s3_buffering.py
@@ -0,0 +1,23 @@
+from smart_open import open
+
+
+def read_bytes(url, limit):
+    bytes_ = []
+    with open(url, 'rb') as fin:
+        for i in range(limit):
+            bytes_.append(fin.read(1))
+
+    return bytes_
+
+
+def test(benchmark):
+    #
+    # This file is around 850MB.
+    #
+    url = (
+        's3://commoncrawl/crawl-data/CC-MAIN-2019-51/segments/1575541319511.97'
+        '/warc/CC-MAIN-20191216093448-20191216121448-00559.warc.gz'
+    )
+    limit = 1000000
+    bytes_ = benchmark(read_bytes, url, limit)
+    assert len(bytes_) == limit
diff --git a/smart_open/s3.py b/smart_open/s3.py
index 96bce460..93d61ffe 100644
--- a/smart_open/s3.py
+++ b/smart_open/s3.py
@@ -314,10 +314,6 @@ def read(self, size=-1):
         if self._eof:
             return self._read_from_buffer()
 
-        #
-        # Fill our buffer to the required size.
-        #
-        # logger.debug('filling %r byte-long buffer up to %r bytes', len(self._buffer), size)
         self._fill_buffer(size)
         return self._read_from_buffer(size)
 
@@ -430,7 +426,7 @@ def _read_from_buffer(self, size=-1):
         return part
 
     def _fill_buffer(self, size=-1):
-        size = size if size >= 0 else self._buffer._chunk_size
+        size = max(size, self._buffer._chunk_size)
         while len(self._buffer) < size and not self._eof:
             bytes_read = self._buffer.fill(self._raw_reader)
             if bytes_read == 0:
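
Note on the change above (a reviewer's sketch, not part of the patch): the new `size = max(size, self._buffer._chunk_size)` line makes `_fill_buffer` top the buffer up to at least one chunk even when the caller asked for a single byte, so the benchmark's one-byte reads are served from the buffer rather than each going back to the raw reader. The toy model below illustrates that effect under a simplifying assumption; `CountingReader`, `count_raw_reads` and the `CHUNK_SIZE` value are invented for illustration and are not smart_open's real classes, and the model assumes each fill fetches exactly the requested number of bytes, which glosses over the details of smart_open's ByteBuffer.

    import io

    CHUNK_SIZE = 256 * 1024  # illustrative value, not necessarily smart_open's default


    class CountingReader:
        """Wrap a stream and count how many times it is asked for data."""

        def __init__(self, stream):
            self._stream = stream
            self.calls = 0

        def read(self, size):
            self.calls += 1
            return self._stream.read(size)


    def count_raw_reads(raw, num_bytes, fill_to_chunk):
        """Read num_bytes one byte at a time through a buffer; return raw read count."""
        reader = CountingReader(raw)
        buffered, pos = b'', 0
        for _ in range(num_bytes):
            if pos == len(buffered):
                # old sizing: fetch exactly what the caller asked for (1 byte)
                # new sizing: fetch at least one chunk, as in the patch
                want = max(1, CHUNK_SIZE) if fill_to_chunk else 1
                buffered, pos = reader.read(want), 0
            pos += 1
        return reader.calls


    data = b'x' * 2 ** 20
    print(count_raw_reads(io.BytesIO(data), 100000, fill_to_chunk=False))  # 100000 raw reads
    print(count_raw_reads(io.BytesIO(data), 100000, fill_to_chunk=True))   # 1 raw read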