Fix issues #152, #153 and #154 (#155)

Merged · 13 commits · Dec 6, 2017

Changes from 3 commits
52 changes: 42 additions & 10 deletions smart_open/s3.py
@@ -28,6 +28,10 @@
MODES = (READ, READ_BINARY, WRITE, WRITE_BINARY)
"""Allowed I/O modes for working with S3."""

BINARY_NEWLINE = b'\n'
TEXT_NEWLINE = b'\n'
Owner:

Both are binary?

Collaborator (Author):

You're right. We're not using TEXT_NEWLINE right now, so I removed it.

DEFAULT_BUFFER_SIZE = 256 * 1024


def _range_string(start, stop=None):
#
@@ -47,7 +51,6 @@ def open(bucket_id, key_id, mode, **kwargs):
if mode not in MODES:
raise NotImplementedError('bad mode: %r expected one of %r' % (mode, MODES))

buffer_size = kwargs.pop("buffer_size", io.DEFAULT_BUFFER_SIZE)
encoding = kwargs.pop("encoding", "utf-8")
errors = kwargs.pop("errors", None)
newline = kwargs.pop("newline", None)
@@ -96,7 +99,8 @@ class BufferedInputBase(io.BufferedIOBase):

Implements the io.BufferedIOBase interface of the standard library."""

def __init__(self, bucket, key, **kwargs):
def __init__(self, bucket, key, buffer_size=DEFAULT_BUFFER_SIZE,
line_terminator=BINARY_NEWLINE, **kwargs):
session = boto3.Session(profile_name=kwargs.pop('profile_name', None))
s3 = session.resource('s3', **kwargs)
self._object = s3.Object(bucket, key)
@@ -105,6 +109,8 @@ def __init__(self, bucket, key, **kwargs):
self._current_pos = 0
self._buffer = b''
self._eof = False
self._buffer_size = buffer_size
self._line_terminator = line_terminator

#
# This member is part of the io.BufferedIOBase interface.
@@ -195,14 +201,7 @@ def read(self, size=-1):
# Fill our buffer to the required size.
#
# logger.debug('filling %r byte-long buffer up to %r bytes', len(self._buffer), size)
while len(self._buffer) < size and not self._eof:
raw = self._raw_reader.read(size=io.DEFAULT_BUFFER_SIZE)
if len(raw):
self._buffer += raw
else:
logger.debug('reached EOF while filling buffer')
self._eof = True

self._fill_buffer(size)
return self._read_from_buffer(size)

def read1(self, size=-1):
@@ -218,6 +217,30 @@ def readinto(self, b):
b[:len(data)] = data
return len(data)

def readline(self, limit=-1):
"""Read up to and including the next newline. Returns the bytes read."""
if limit != -1:
raise NotImplementedError('limits other than -1 not implemented yet')
the_line = io.BytesIO()
while not (self._eof and len(self._buffer) == 0):
#
# In the worst case, we're reading self._buffer twice here, once in
# the if condition, and once when calling index.
#
# This is sub-optimal, but better than the alternative: wrapping
# .index in a try..except, because that is slower.
#
if self._line_terminator in self._buffer:
next_newline = self._buffer.index(self._line_terminator)
the_line.write(self._buffer[:next_newline + 1])
self._buffer = self._buffer[next_newline + 1:]
break
else:
the_line.write(self._buffer)
self._buffer = b''
self._fill_buffer(self._buffer_size)
return the_line.getvalue()

def terminate(self):
"""Do nothing."""
pass
@@ -235,6 +258,15 @@ def _read_from_buffer(self, size):
# logger.debug('part: %r', part)
return part

def _fill_buffer(self, size):
while len(self._buffer) < size and not self._eof:
raw = self._raw_reader.read(size=self._buffer_size)
if len(raw):
self._buffer += raw
else:
logger.debug('reached EOF while filling buffer')
self._eof = True


class BufferedOutputBase(io.BufferedIOBase):
"""Writes bytes to S3.
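An aside on the implementation: the readline/_fill_buffer pair above is easier to follow in isolation. Below is a minimal standalone sketch of the same buffering pattern over an in-memory stream; chunked_lines and CHUNK are names invented for this illustration, but the technique (testing `terminator in buffer` before calling .index, so .index can never raise, and refilling the buffer in fixed-size chunks) mirrors the diff:

import io

CHUNK = 8  # deliberately tiny, to force several buffer refills

def chunked_lines(raw, terminator=b'\n'):
    # Yield terminator-delimited lines from a raw byte stream,
    # refilling an internal buffer CHUNK bytes at a time.
    buffer = b''
    eof = False
    while not (eof and len(buffer) == 0):
        if terminator in buffer:           # cheap membership test first...
            i = buffer.index(terminator)   # ...so .index can never raise
            yield buffer[:i + 1]
            buffer = buffer[i + 1:]
        else:
            chunk = raw.read(CHUNK)
            if chunk:
                buffer += chunk
            else:
                eof = True
                if buffer:                 # trailing bytes with no terminator
                    yield buffer
                    buffer = b''

raw = io.BytesIO(b'englishman\nin\nnew\nyork\n')
assert list(chunked_lines(raw)) == [b'englishman\n', b'in\n', b'new\n', b'york\n']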
60 changes: 38 additions & 22 deletions smart_open/tests/test_s3.py
@@ -9,7 +9,7 @@
else:
import unittest

import boto
import boto3
Owner:

This seems to be a big change; does it belong here?

Collaborator (Author):

@piskvorky It's not really that big a change. This is test code, and it writes a mock object to a mock S3 bucket. The code for doing this with boto and boto3 is slightly different, but the end result is the same (the tests still pass without changing the code). If you look at the remainder of the changes in test_s3.py, you'll see the tests aren't strongly coupled to either boto or boto3.

The real benefit of using boto3 in the tests is that it matches the implementation: our S3 implementation uses boto3 under the covers. Using boto3 is also future-proof: newer versions of moto mock boto and boto3 separately, so objects mocked via boto are not visible to boto3 and vice versa. This means that if we upgrade to the most recent moto version (rather than the year-old version currently used in the test environment), our boto-based tests will break.

This change solves that problem before it happens.
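For readers unfamiliar with moto, the boto3 pattern the updated tests rely on boils down to a few lines. The following is a minimal sketch, not part of this PR; the bucket and key names are arbitrary placeholders:

import boto3
import moto

@moto.mock_s3
def roundtrip():
    # Inside the mock decorator, boto3 talks to moto's in-memory S3,
    # never the real AWS service.
    s3 = boto3.resource('s3')
    s3.create_bucket(Bucket='mybucket')
    s3.Object('mybucket', 'mykey').put(Body=b'hello world')
    return s3.Object('mybucket', 'mykey').get()['Body'].read()

assert roundtrip() == b'hello world'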

import moto

import smart_open
@@ -21,15 +21,11 @@
def create_bucket_and_key(bucket_name='mybucket', key_name='mykey', contents=None):
# fake connection, bucket and key
_LOGGER.debug('%r', locals())
conn = boto.connect_s3()
conn.create_bucket(bucket_name)
mybucket = conn.get_bucket(bucket_name)
mykey = boto.s3.key.Key()
mykey.name = key_name
mykey.bucket = mybucket
s3 = boto3.resource('s3')
mybucket = s3.create_bucket(Bucket=bucket_name)
mykey = s3.Object(bucket_name, key_name)
if contents is not None:
_LOGGER.debug('len(contents): %r', len(contents))
mykey.set_contents_from_string(contents)
mykey.put(Body=contents)
return mybucket, mykey


@@ -47,7 +43,7 @@ def test_iter(self):
"""Are S3 files iterated over correctly?"""
# a list of strings to test with
expected = u"hello wořld\nhow are you?".encode('utf8')
bucket, key = create_bucket_and_key(contents=expected)
create_bucket_and_key(contents=expected)

# connect to fake s3 and read from the fake key we filled above
fin = smart_open.s3.BufferedInputBase('mybucket', 'mykey')
@@ -57,15 +53,15 @@
def test_iter_context_manager(self):
# same thing but using a context manager
expected = u"hello wořld\nhow are you?".encode('utf8')
bucket, key = create_bucket_and_key(contents=expected)
create_bucket_and_key(contents=expected)
with smart_open.s3.BufferedInputBase('mybucket', 'mykey') as fin:
output = [line.rstrip(b'\n') for line in fin]
self.assertEqual(output, expected.split(b'\n'))

def test_read(self):
"""Are S3 files read correctly?"""
content = u"hello wořld\nhow are you?".encode('utf8')
bucket, key = create_bucket_and_key(contents=content)
create_bucket_and_key(contents=content)
_LOGGER.debug('content: %r len: %r', content, len(content))

fin = smart_open.s3.BufferedInputBase('mybucket', 'mykey')
@@ -76,7 +72,7 @@ def test_seek_beginning(self):
def test_seek_beginning(self):
"""Does seeking to the beginning of S3 files work correctly?"""
content = u"hello wořld\nhow are you?".encode('utf8')
bucket, key = create_bucket_and_key(contents=content)
create_bucket_and_key(contents=content)

fin = smart_open.s3.BufferedInputBase('mybucket', 'mykey')
self.assertEqual(content[:6], fin.read(6))
@@ -91,7 +87,7 @@ def test_seek_start(self):
def test_seek_start(self):
"""Does seeking from the start of S3 files work correctly?"""
content = u"hello wořld\nhow are you?".encode('utf8')
bucket, key = create_bucket_and_key(contents=content)
create_bucket_and_key(contents=content)

fin = smart_open.s3.BufferedInputBase('mybucket', 'mykey')
seek = fin.seek(6)
@@ -102,7 +98,7 @@ def test_seek_current(self):
def test_seek_current(self):
"""Does seeking from the middle of S3 files work correctly?"""
content = u"hello wořld\nhow are you?".encode('utf8')
bucket, key = create_bucket_and_key(contents=content)
create_bucket_and_key(contents=content)

fin = smart_open.s3.BufferedInputBase('mybucket', 'mykey')
self.assertEqual(fin.read(5), b'hello')
@@ -113,7 +109,7 @@ def test_seek_end(self):
def test_seek_end(self):
"""Does seeking from the end of S3 files work correctly?"""
content = u"hello wořld\nhow are you?".encode('utf8')
bucket, key = create_bucket_and_key(contents=content)
create_bucket_and_key(contents=content)

fin = smart_open.s3.BufferedInputBase('mybucket', 'mykey')
seek = fin.seek(-4, whence=smart_open.s3.END)
@@ -122,7 +118,7 @@

def test_detect_eof(self):
content = u"hello wořld\nhow are you?".encode('utf8')
bucket, key = create_bucket_and_key(contents=content)
create_bucket_and_key(contents=content)

fin = smart_open.s3.BufferedInputBase('mybucket', 'mykey')
fin.read()
@@ -137,7 +133,7 @@ def test_read_gzip(self):
buf.close = lambda: None # keep buffer open so that we can .getvalue()
with contextlib.closing(gzip.GzipFile(fileobj=buf, mode='w')) as zipfile:
zipfile.write(expected)
bucket, key = create_bucket_and_key(contents=buf.getvalue())
create_bucket_and_key(contents=buf.getvalue())

#
# Make sure we're reading things correctly.
@@ -159,6 +155,26 @@

self.assertEqual(expected, actual)

def test_readline(self):
content = b'englishman\nin\nnew\nyork\n'
create_bucket_and_key(contents=content)

with smart_open.s3.BufferedInputBase('mybucket', 'mykey') as fin:
actual = list(fin)

expected = [b'englishman\n', b'in\n', b'new\n', b'york\n']
self.assertEqual(expected, actual)

def test_readline_tiny_buffer(self):
content = b'englishman\nin\nnew\nyork\n'
create_bucket_and_key(contents=content)

with smart_open.s3.BufferedInputBase('mybucket', 'mykey', buffer_size=8) as fin:
actual = list(fin)

expected = [b'englishman\n', b'in\n', b'new\n', b'york\n']
self.assertEqual(expected, actual)


@moto.mock_s3
class BufferedOutputBaseTest(unittest.TestCase):
@@ -168,7 +184,7 @@ class BufferedOutputBaseTest(unittest.TestCase):
"""
def test_write_01(self):
"""Does writing into s3 work correctly?"""
mybucket, mykey = create_bucket_and_key()
create_bucket_and_key()
test_string = u"žluťoučký koníček".encode('utf8')

# write into key
@@ -182,7 +198,7 @@

def test_write_01a(self):
"""Does s3 write fail on incorrect input?"""
mybucket, mykey = create_bucket_and_key()
create_bucket_and_key()

try:
with smart_open.s3.BufferedOutputBase('mybucket', 'writekey') as fin:
@@ -194,7 +210,7 @@ def test_write_01a(self):

def test_write_02(self):
"""Does s3 write unicode-utf8 conversion work?"""
mybucket, mykey = create_bucket_and_key()
create_bucket_and_key()

smart_open_write = smart_open.s3.BufferedOutputBase('mybucket', 'writekey')
smart_open_write.tell()
@@ -205,7 +221,7 @@

def test_write_03(self):
"""Does s3 multipart chunking work correctly?"""
mybucket, mykey = create_bucket_and_key()
create_bucket_and_key()

# write
smart_open_write = smart_open.s3.BufferedOutputBase(
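A closing usage note: the two new BufferedInputBase arguments compose, so a caller can split records on any single-byte delimiter while tuning how much data each network call fetches. A minimal sketch, assuming a bucket populated as in the tests above; process() is a stand-in handler invented for this example, not something defined in the PR:

import smart_open.s3

def process(record):
    print(record)  # stand-in handler for this sketch

with smart_open.s3.BufferedInputBase('mybucket', 'mykey',
                                     buffer_size=256,
                                     line_terminator=b'\x00') as fin:
    for record in fin:   # iteration goes through readline() under the hood
        process(record)

Note that readline() slices at index(terminator) + 1, so only single-byte terminators split correctly as written.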