openlawlibrary · danixeee · May 14, 2020 · May 2, 2020 · May 2, 2020 · May 3, 2020
diff --git a/pygls/workspace.py b/pygls/workspace.py
@@ -33,6 +33,58 @@
 log = logging.getLogger(__name__)
 
 
+def position_to_rowcol(lines: List[str], position: Position) -> tuple:
+    """Convert an LSP position into a (row, column) pair.
+
+    The LSP specification expresses the character member of `Position` as
+    an offset in UTF-16 code units, whereas Python strings are indexed by
+    code point, so the offset cannot be used as a string index directly.
+
+    All characters whose code point exceeds the Basic Multilingual Plane
+    (above 0xFFFF) are represented by 2 UTF-16 code units but by a single
+    Python index.
+
+    The offset of the closing quotation mark in x="😋" is
+    - 5 in UTF-16 code units
+    - 4 in code points
+
+    Returns `(row, col)` where `col` is a code point index into
+    `lines[row]`:
+    - a position whose line is past the last line maps to `(len(lines), 0)`
+    - an offset that falls inside a surrogate pair maps to the start of
+      that character
+    - an offset past the end of the line stays past the end by the same
+      number of units
+
+    see: https://github.com/microsoft/language-server-protocol/issues/376
+    """
+    row = len(lines)
+    col = 0
+    if row > position.line:
+        row = position.line
+        units = 0
+        # Consume whole characters while they still fit in front of the
+        # UTF-16 offset. Slicing the line with `position.character` first
+        # would also count wide characters located *after* the position,
+        # because the slice is measured in code points, not code units.
+        for ch in lines[row]:
+            width = 2 if ord(ch) > 0xFFFF else 1
+            if units + width > position.character:
+                break
+            units += width
+            col += 1
+        else:
+            # The whole line was consumed: the offset is at or beyond the
+            # end of the line, so carry over the remaining distance.
+            col += position.character - units
+    return (row, col)
+
+
+def rowcol_to_position(lines: List[str], row: int, col: int) -> Position:
+    """Convert a (row, column) pair into an LSP Position.
+
+    `col` is a code point index into `lines[row]`; the returned
+    `Position` carries the equivalent offset in UTF-16 code units, which
+    is the representation the LSP specification uses for the character
+    member of `Position`.
+
+    All characters whose code point exceeds the Basic Multilingual Plane
+    are represented by 2 UTF-16 code units, so each of them found before
+    `col` adds two to the offset instead of one.
+
+    A `row` at or past the number of lines yields
+    `Position(len(lines), 0)`.
+    """
+    if row >= len(lines):
+        return Position(len(lines), 0)
+
+    utf16_units = 0
+    for char in lines[row][:col]:
+        utf16_units += 2 if ord(char) > 0xFFFF else 1
+
+    return Position(row, utf16_units)
+
+
 class Document(object):
 
     def __init__(self, uri, source=None, version=None, local=True,
@@ -54,16 +106,15 @@ def __str__(self):
 
     def _apply_incremental_change(self, change: TextDocumentContentChangeEvent) -> None:
         """Apply an INCREMENTAL text change to the document"""
+        lines = self.lines
         text = change.text
         change_range = change.range
 
-        start_line = change_range.start.line
-        start_col = change_range.start.character
-        end_line = change_range.end.line
-        end_col = change_range.end.character
+        start_line, start_col = position_to_rowcol(lines, change_range.start)
+        end_line, end_col = position_to_rowcol(lines, change_range.end)
 
         # Check for an edit occuring at the very end of the file
-        if start_line == len(self.lines):
+        if start_line == len(lines):
             self._source = self.source + text
             return
 
@@ -72,7 +123,7 @@ def _apply_incremental_change(self, change: TextDocumentContentChangeEvent) -> N
         # Iterate over the existing document until we hit the edit range,
         # at which point we write the new text, then loop until we hit
         # the end of the range and continue writing.
-        for i, line in enumerate(self.lines):
+        for i, line in enumerate(lines):
             if i < start_line:
                 new.write(line)
                 continue
@@ -150,9 +201,17 @@ def apply_change(self, change: TextDocumentContentChangeEvent) -> None:
     def lines(self) -> List[str]:
         return self.source.splitlines(True)
 
+    def position_to_rowcol(self, position: Position) -> tuple:
+        """Convert an LSP position into a (row, col) pair for this document."""
+        document_lines = self.lines
+        return position_to_rowcol(document_lines, position)
+
+    def rowcol_to_position(self, row: int, col: int) -> Position:
+        """Convert a (row, col) pair of this document into an LSP Position."""
+        document_lines = self.lines
+        return rowcol_to_position(document_lines, row, col)
+
     def offset_at_position(self, position: Position) -> int:
-        """Return the byte-offset pointed at by the given position."""
-        return position.character + len(''.join(self.lines[:position.line]))
+        """Return the character offset pointed at by the given position.
+
+        The position is first converted to a (row, col) pair; the result
+        is the total length of all lines before `row` plus `col`.
+        """
+        doc_lines = self.lines
+        row, col = position_to_rowcol(doc_lines, position)
+        preceding_text = ''.join(doc_lines[:row])
+        return len(preceding_text) + col
 
     @property
     def source(self) -> str:
@@ -165,14 +224,15 @@ def word_at_position(self, position: Position) -> str:
         """
         Get the word under the cursor returning the start and end positions.
         """
-        if position.line >= len(self.lines):
+        lines = self.lines
+        if position.line >= len(lines):
             return ''
 
-        line = self.lines[position.line]
-        i = position.character
+        row, col = position_to_rowcol(lines, position)
+        line = lines[row]
         # Split word in two
-        start = line[:i]
-        end = line[i:]
+        start = line[:col]
+        end = line[col:]
 
         # Take end of start and start of end to find word
         # These are guaranteed to match, even if they match the empty string

diff --git a/tests/conftest.py b/tests/conftest.py
@@ -32,6 +32,7 @@
 DOC = """document
 for
 testing
+with "😋" unicode.
 """
 DOC_URI = uris.from_fs_path(__file__)
 

diff --git a/tests/test_document.py b/tests/test_document.py
@@ -74,7 +74,7 @@ def test_document_line_edit():
 
 
 def test_document_lines(doc):
-    assert len(doc.lines) == 3
+    assert len(doc.lines) == 4
     assert doc.lines[0] == 'document\n'
 
 
@@ -125,7 +125,11 @@ def test_offset_at_position(doc):
     assert doc.offset_at_position(Position(1, 5)) == 14
     assert doc.offset_at_position(Position(2, 0)) == 13
     assert doc.offset_at_position(Position(2, 4)) == 17
-    assert doc.offset_at_position(Position(4, 0)) == 21
+    assert doc.offset_at_position(Position(3, 6)) == 27
+    assert doc.offset_at_position(Position(3, 7)) == 27
+    assert doc.offset_at_position(Position(3, 8)) == 28
+    assert doc.offset_at_position(Position(4, 0)) == 39
+    assert doc.offset_at_position(Position(5, 0)) == 39
 
 
 def test_word_at_position(doc):
@@ -136,4 +140,5 @@ def test_word_at_position(doc):
     assert doc.word_at_position(Position(0, 1000)) == 'document'
     assert doc.word_at_position(Position(1, 5)) == 'for'
     assert doc.word_at_position(Position(2, 0)) == 'testing'
+    assert doc.word_at_position(Position(3, 10)) == 'unicode'
     assert doc.word_at_position(Position(4, 0)) == ''
-Original file line number
+Diff line change
@@ Expand Up / @@ -32,6 +32,7 @@ @@
     DOC = """document
     for
     testing
+    with "😋" unicode.
     """
     DOC_URI = uris.from_fs_path(__file__)
@@ Expand Down @@