Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Fix utf-16 to utf-32 character offsets. #117

Merged
merged 10 commits into from
May 14, 2020
86 changes: 73 additions & 13 deletions pygls/workspace.py
Original file line number Diff line number Diff line change
Expand Up @@ -33,6 +33,58 @@
log = logging.getLogger(__name__)


def position_to_rowcol(lines: List[str], position: "Position") -> tuple:
    """Convert an LSP position into a (row, column) pair.

    This function converts the position's character offset from
    UTF-16 code units to code points (Python string indices).

    The offset of the closing quotation mark in x="😋" is
      - 5 in UTF-16 representation
      - 4 in UTF-32 representation

    A Python application can't use the `character` member of the position
    directly: per specification it is a zero-based offset based on a
    UTF-16 string representation, while Python strings are indexed by
    code point.

    All characters whose code point exceeds the Basic Multilingual Plane
    are represented by 2 UTF-16 code units.

    Args:
        lines: The lines of the document.
        position: Object with zero-based `line` and `character` attributes,
            where `character` counts UTF-16 code units.

    Returns:
        A `(row, col)` tuple where `col` is a code point index into
        `lines[row]`. If `position.line` is not a valid index,
        `(len(lines), 0)` is returned. A `character` pointing into the
        middle of a surrogate pair maps to the column of that character.
        A `character` past the end of the line is not clamped; the surplus
        units are added to the column unchanged.

    see: https://github.com/microsoft/language-server-protocol/issues/376
    """
    row = len(lines)
    col = 0
    if row > position.line:
        row = position.line
        # Walk the line and consume UTF-16 units character by character.
        # Slicing the line with the UTF-16 offset would also count astral
        # characters located *after* the requested position.
        remaining = position.character
        for ch in lines[row]:
            width = 2 if ord(ch) > 0xFFFF else 1
            if remaining < width:
                break
            remaining -= width
            col += 1
        else:
            # Line exhausted before the offset was reached: keep the
            # surplus so out-of-range offsets stay out of range.
            col += remaining
    return (row, col)


def rowcol_to_position(lines: List[str], row: int, col: int) -> Position:
    """Convert a (row, column) pair into an LSP Position.

    The `col` argument is a code point index into `lines[row]`; the
    returned `Position` carries the equivalent offset in UTF-16 code units.

    A Python application can't use the `character` member of `Position`
    directly: per specification it is a zero-based offset based on a
    UTF-16 string representation.

    All characters whose code point exceeds the Basic Multilingual Plane
    are represented by 2 UTF-16 code units.

    If `row` is not below `len(lines)`, `Position(len(lines), 0)` is
    returned.
    """
    line_count = len(lines)
    if not line_count > row:
        return Position(line_count, 0)

    utf16_units = 0
    for ch in lines[row][:col]:
        # Astral characters take a surrogate pair, i.e. two code units.
        utf16_units += 2 if ord(ch) > 0xFFFF else 1

    return Position(row, utf16_units)


class Document(object):
deathaxe marked this conversation as resolved.
Show resolved Hide resolved

def __init__(self, uri, source=None, version=None, local=True,
Expand All @@ -54,16 +106,15 @@ def __str__(self):

def _apply_incremental_change(self, change: TextDocumentContentChangeEvent) -> None:
"""Apply an INCREMENTAL text change to the document"""
lines = self.lines
text = change.text
change_range = change.range

start_line = change_range.start.line
start_col = change_range.start.character
end_line = change_range.end.line
end_col = change_range.end.character
start_line, start_col = position_to_rowcol(lines, change_range.start)
end_line, end_col = position_to_rowcol(lines, change_range.end)

# Check for an edit occurring at the very end of the file
if start_line == len(self.lines):
if start_line == len(lines):
self._source = self.source + text
return

Expand All @@ -72,7 +123,7 @@ def _apply_incremental_change(self, change: TextDocumentContentChangeEvent) -> N
# Iterate over the existing document until we hit the edit range,
# at which point we write the new text, then loop until we hit
# the end of the range and continue writing.
for i, line in enumerate(self.lines):
for i, line in enumerate(lines):
if i < start_line:
new.write(line)
continue
Expand Down Expand Up @@ -150,9 +201,17 @@ def apply_change(self, change: TextDocumentContentChangeEvent) -> None:
def lines(self) -> List[str]:
return self.source.splitlines(True)

def position_to_rowcol(self, position: Position) -> tuple:
    """Convert `position` to a row, column pair using this document's lines."""
    current_lines = self.lines
    return position_to_rowcol(current_lines, position)

def rowcol_to_position(self, row: int, col: int) -> Position:
    """Convert a row, column pair to a Position using this document's lines."""
    current_lines = self.lines
    return rowcol_to_position(current_lines, row, col)

def offset_at_position(self, position: Position) -> int:
"""Return the byte-offset pointed at by the given position."""
return position.character + len(''.join(self.lines[:position.line]))
"""Return the character offset pointed at by the given position."""
lines = self.lines
row, col = position_to_rowcol(lines, position)
return col + sum(len(line) for line in lines[:row])

@property
def source(self) -> str:
Expand All @@ -165,14 +224,15 @@ def word_at_position(self, position: Position) -> str:
"""
Get the word under the cursor returning the start and end positions.
"""
if position.line >= len(self.lines):
lines = self.lines
if position.line >= len(lines):
return ''

line = self.lines[position.line]
i = position.character
row, col = position_to_rowcol(lines, position)
line = lines[row]
# Split word in two
start = line[:i]
end = line[i:]
start = line[:col]
end = line[col:]

# Take end of start and start of end to find word
# These are guaranteed to match, even if they match the empty string
Expand Down
1 change: 1 addition & 0 deletions tests/conftest.py
Original file line number Diff line number Diff line change
Expand Up @@ -32,6 +32,7 @@
DOC = """document
for
testing
with "😋" unicode.
"""
DOC_URI = uris.from_fs_path(__file__)

Expand Down
9 changes: 7 additions & 2 deletions tests/test_document.py
Original file line number Diff line number Diff line change
Expand Up @@ -74,7 +74,7 @@ def test_document_line_edit():


def test_document_lines(doc):
assert len(doc.lines) == 3
assert len(doc.lines) == 4
assert doc.lines[0] == 'document\n'


Expand Down Expand Up @@ -125,7 +125,11 @@ def test_offset_at_position(doc):
assert doc.offset_at_position(Position(1, 5)) == 14
assert doc.offset_at_position(Position(2, 0)) == 13
assert doc.offset_at_position(Position(2, 4)) == 17
assert doc.offset_at_position(Position(4, 0)) == 21
assert doc.offset_at_position(Position(3, 6)) == 27
assert doc.offset_at_position(Position(3, 7)) == 27
assert doc.offset_at_position(Position(3, 8)) == 28
assert doc.offset_at_position(Position(4, 0)) == 39
assert doc.offset_at_position(Position(5, 0)) == 39


def test_word_at_position(doc):
Expand All @@ -136,4 +140,5 @@ def test_word_at_position(doc):
assert doc.word_at_position(Position(0, 1000)) == 'document'
assert doc.word_at_position(Position(1, 5)) == 'for'
assert doc.word_at_position(Position(2, 0)) == 'testing'
assert doc.word_at_position(Position(3, 10)) == 'unicode'
assert doc.word_at_position(Position(4, 0)) == ''