From de523457fbc4aced4076f0ef61e5fb9e5f338b60 Mon Sep 17 00:00:00 2001 From: Oliver Mannion <125105+tekumara@users.noreply.github.com> Date: Tue, 26 Dec 2023 16:11:27 +1100 Subject: [PATCH] fix: count positions as utf-16 code units resolves #22 --- Cargo.lock | 1 - crates/typos-lsp/Cargo.toml | 1 - crates/typos-lsp/src/lsp.rs | 10 +++++++--- crates/typos-lsp/tests/integration_test.rs | 17 ++++++++++++----- 4 files changed, 19 insertions(+), 10 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 172618d..6a9f71c 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -1626,7 +1626,6 @@ dependencies = [ "tracing-subscriber", "typos", "typos-cli", - "unicode-segmentation", ] [[package]] diff --git a/crates/typos-lsp/Cargo.toml b/crates/typos-lsp/Cargo.toml index 8687c0d..13251ad 100644 --- a/crates/typos-lsp/Cargo.toml +++ b/crates/typos-lsp/Cargo.toml @@ -19,7 +19,6 @@ serde = { version = "1.0", features = ["derive"] } ignore = "0.4.20" matchit = "0.7.1" shellexpand = "3.1.0" -unicode-segmentation = "1.10.1" regex = "1.10.2" once_cell = "1.19.0" diff --git a/crates/typos-lsp/src/lsp.rs b/crates/typos-lsp/src/lsp.rs index 2433032..367642e 100644 --- a/crates/typos-lsp/src/lsp.rs +++ b/crates/typos-lsp/src/lsp.rs @@ -192,6 +192,8 @@ impl LanguageServer for Backend<'static, 'static> { Ok(InitializeResult { capabilities: ServerCapabilities { + // only support UTF-16 positions for now, which is the default when unspecified + position_encoding: Some(PositionEncodingKind::UTF16), text_document_sync: Some(TextDocumentSyncCapability::Kind( // TODO: should we support incremental? TextDocumentSyncKind::FULL, @@ -459,9 +461,11 @@ impl AccumulatePosition { .unwrap_or(0); let before_typo = String::from_utf8_lossy(&buffer[line_start..byte_offset]); - let line_pos = - unicode_segmentation::UnicodeSegmentation::graphemes(before_typo.as_ref(), true) - .count(); + + // count UTF-16 code units as per + // https://microsoft.github.io/language-server-protocol/specifications/lsp/3.17/specification/#textDocuments + // UTF-16 is the only position encoding we support for now + let line_pos = before_typo.chars().map(char::len_utf16).sum(); self.line_num = line_num; self.line_pos = line_pos; diff --git a/crates/typos-lsp/tests/integration_test.rs b/crates/typos-lsp/tests/integration_test.rs index a48b65a..d595bc8 100644 --- a/crates/typos-lsp/tests/integration_test.rs +++ b/crates/typos-lsp/tests/integration_test.rs @@ -20,6 +20,7 @@ async fn test_initialize_e2e() { "codeActionKinds": ["quickfix"], "workDoneProgress": false }, + "positionEncoding": "utf-16", "textDocumentSync": 1, "workspace": { "workspaceFolders": { "changeNotifications": true, "supported": true } @@ -218,17 +219,23 @@ async fn test_custom_config_file() { } #[test_log::test(tokio::test)] -async fn test_unicode_diagnostics() { - let did_open = &did_open("¿Qué hace él?"); - +async fn test_position_with_unicode_text() { let mut server = TestServer::new(); let _ = server.request(&initialize()).await; - // start position should count graphemes with multiple code points as one visible character + // ¿ and é are two-byte code points in utf-8 + let unicode_text = &did_open("¿Qué hace él?"); similar_asserts::assert_eq!( - server.request(&did_open).await, + server.request(&unicode_text).await, publish_diagnostics(&[diag("`hace` should be `have`", 0, 5, 9)]) ); + + // ẽ has two code points U+0065 U+0303 (latin small letter e, combining tilde) + let unicode_text = &did_open("ẽ hace"); + similar_asserts::assert_eq!( + server.request(&unicode_text).await, + publish_diagnostics(&[diag("`hace` should be `have`", 0, 3, 7)]) + ); } fn initialize() -> String {