From 0cb76b9b5406b106b448436824019a099d7ef310 Mon Sep 17 00:00:00 2001 From: Jane Lewis Date: Fri, 23 Feb 2024 22:27:02 -0800 Subject: [PATCH] Create the `ruff_server` crate and the `ruff server` command. --- Cargo.lock | 137 +- Cargo.toml | 7 + crates/ruff/Cargo.toml | 3 + crates/ruff/src/args.rs | 5 + crates/ruff/src/commands/mod.rs | 1 + crates/ruff/src/commands/server.rs | 69 + crates/ruff/src/lib.rs | 8 + crates/ruff_formatter/src/lib.rs | 4 + crates/ruff_server/Cargo.toml | 46 + crates/ruff_server/README.md | 2 + .../resources/test/fixtures/pandas_html.py | 1240 +++++++++++++++++ crates/ruff_server/src/edit.rs | 48 + crates/ruff_server/src/edit/document.rs | 120 ++ crates/ruff_server/src/edit/range.rs | 148 ++ crates/ruff_server/src/format.rs | 29 + crates/ruff_server/src/lib.rs | 23 + crates/ruff_server/src/lint.rs | 119 ++ crates/ruff_server/src/server.rs | 152 ++ crates/ruff_server/src/server/api.rs | 148 ++ .../src/server/api/notifications.rs | 17 + .../src/server/api/notifications/cancel.rs | 22 + .../server/api/notifications/did_change.rs | 39 + .../notifications/did_change_configuration.rs | 22 + .../api/notifications/did_change_workspace.rs | 32 + .../src/server/api/notifications/did_close.rs | 26 + .../src/server/api/notifications/did_open.rs | 31 + crates/ruff_server/src/server/api/requests.rs | 15 + .../src/server/api/requests/code_action.rs | 69 + .../src/server/api/requests/diagnostic.rs | 38 + .../src/server/api/requests/format.rs | 36 + .../src/server/api/requests/format_range.rs | 40 + crates/ruff_server/src/server/api/traits.rs | 142 ++ crates/ruff_server/src/server/client.rs | 77 + crates/ruff_server/src/server/schedule.rs | 96 ++ .../ruff_server/src/server/schedule/task.rs | 111 ++ .../ruff_server/src/server/schedule/thread.rs | 109 ++ .../src/server/schedule/thread/pool.rs | 106 ++ .../src/server/schedule/thread/priority.rs | 300 ++++ crates/ruff_server/src/session.rs | 402 ++++++ crates/ruff_server/src/session/types.rs | 35 + 
crates/ruff_server/tests/document.rs | 91 ++ .../document__delete_lines_pandas_html.snap | 1233 ++++++++++++++++ crates/ruff_source_file/src/line_index.rs | 5 + 43 files changed, 5402 insertions(+), 1 deletion(-) create mode 100644 crates/ruff/src/commands/server.rs create mode 100644 crates/ruff_server/Cargo.toml create mode 100644 crates/ruff_server/README.md create mode 100644 crates/ruff_server/resources/test/fixtures/pandas_html.py create mode 100644 crates/ruff_server/src/edit.rs create mode 100644 crates/ruff_server/src/edit/document.rs create mode 100644 crates/ruff_server/src/edit/range.rs create mode 100644 crates/ruff_server/src/format.rs create mode 100644 crates/ruff_server/src/lib.rs create mode 100644 crates/ruff_server/src/lint.rs create mode 100644 crates/ruff_server/src/server.rs create mode 100644 crates/ruff_server/src/server/api.rs create mode 100644 crates/ruff_server/src/server/api/notifications.rs create mode 100644 crates/ruff_server/src/server/api/notifications/cancel.rs create mode 100644 crates/ruff_server/src/server/api/notifications/did_change.rs create mode 100644 crates/ruff_server/src/server/api/notifications/did_change_configuration.rs create mode 100644 crates/ruff_server/src/server/api/notifications/did_change_workspace.rs create mode 100644 crates/ruff_server/src/server/api/notifications/did_close.rs create mode 100644 crates/ruff_server/src/server/api/notifications/did_open.rs create mode 100644 crates/ruff_server/src/server/api/requests.rs create mode 100644 crates/ruff_server/src/server/api/requests/code_action.rs create mode 100644 crates/ruff_server/src/server/api/requests/diagnostic.rs create mode 100644 crates/ruff_server/src/server/api/requests/format.rs create mode 100644 crates/ruff_server/src/server/api/requests/format_range.rs create mode 100644 crates/ruff_server/src/server/api/traits.rs create mode 100644 crates/ruff_server/src/server/client.rs create mode 100644 crates/ruff_server/src/server/schedule.rs create 
mode 100644 crates/ruff_server/src/server/schedule/task.rs create mode 100644 crates/ruff_server/src/server/schedule/thread.rs create mode 100644 crates/ruff_server/src/server/schedule/thread/pool.rs create mode 100644 crates/ruff_server/src/server/schedule/thread/priority.rs create mode 100644 crates/ruff_server/src/session.rs create mode 100644 crates/ruff_server/src/session/types.rs create mode 100644 crates/ruff_server/tests/document.rs create mode 100644 crates/ruff_server/tests/snapshots/document__delete_lines_pandas_html.snap diff --git a/Cargo.lock b/Cargo.lock index 01615ccf6d37dd..19ac168cd52790 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -531,6 +531,19 @@ dependencies = [ "itertools 0.10.5", ] +[[package]] +name = "crossbeam" +version = "0.8.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1137cd7e7fc0fb5d3c5a8678be38ec56e819125d8d7907411fe24ccb943faca8" +dependencies = [ + "crossbeam-channel", + "crossbeam-deque", + "crossbeam-epoch", + "crossbeam-queue", + "crossbeam-utils", +] + [[package]] name = "crossbeam-channel" version = "0.5.11" @@ -559,6 +572,15 @@ dependencies = [ "crossbeam-utils", ] +[[package]] +name = "crossbeam-queue" +version = "0.3.11" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "df0346b5d5e76ac2fe4e327c5fd1118d6be7c51dfb18f9b7922923f287471e35" +dependencies = [ + "crossbeam-utils", +] + [[package]] name = "crossbeam-utils" version = "0.8.19" @@ -1149,6 +1171,12 @@ version = "1.0.10" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "b1a46d1a171d865aa5f83f92695765caa047a9b4cbae2cbf37dbd613a793fd4c" +[[package]] +name = "jod-thread" +version = "0.1.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8b23360e99b8717f20aaa4598f5a6541efbe30630039fbc7706cf954a87947ae" + [[package]] name = "js-sys" version = "0.3.68" @@ -1322,6 +1350,31 @@ version = "0.4.20" source = 
"registry+https://github.com/rust-lang/crates.io-index" checksum = "b5e6163cb8c49088c2c36f57875e58ccd8c87c7427f7fbd50ea6710b2f3f2e8f" +[[package]] +name = "lsp-server" +version = "0.7.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "248f65b78f6db5d8e1b1604b4098a28b43d21a8eb1deeca22b1c421b276c7095" +dependencies = [ + "crossbeam-channel", + "log", + "serde", + "serde_json", +] + +[[package]] +name = "lsp-types" +version = "0.95.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "158c1911354ef73e8fe42da6b10c0484cb65c7f1007f28022e847706c1ab6984" +dependencies = [ + "bitflags 1.3.2", + "serde", + "serde_json", + "serde_repr", + "url", +] + [[package]] name = "matchers" version = "0.1.0" @@ -1625,6 +1678,7 @@ version = "0.11.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "ade2d8b8f33c7333b51bcf0428d37e217e9f32192ae4772156f65063b8ce03dc" dependencies = [ + "phf_macros", "phf_shared 0.11.2", ] @@ -1648,6 +1702,19 @@ dependencies = [ "rand", ] +[[package]] +name = "phf_macros" +version = "0.11.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3444646e286606587e49f3bcf1679b8cef1dc2c5ecc29ddacaffc305180d464b" +dependencies = [ + "phf_generator", + "phf_shared 0.11.2", + "proc-macro2", + "quote", + "syn 2.0.49", +] + [[package]] name = "phf_shared" version = "0.10.0" @@ -1982,6 +2049,7 @@ dependencies = [ "ruff_notebook", "ruff_python_ast", "ruff_python_formatter", + "ruff_server", "ruff_source_file", "ruff_text_size", "ruff_workspace", @@ -1996,6 +2064,8 @@ dependencies = [ "tikv-jemallocator", "toml", "tracing", + "tracing-subscriber", + "tracing-tree", "walkdir", "wild", ] @@ -2363,6 +2433,37 @@ dependencies = [ "unicode-ident", ] +[[package]] +name = "ruff_server" +version = "0.2.2" +dependencies = [ + "anyhow", + "crossbeam", + "insta", + "jod-thread", + "libc", + "lsp-server", + "lsp-types", + "notify", + "phf", + "ruff_diagnostics", + 
"ruff_formatter", + "ruff_linter", + "ruff_python_ast", + "ruff_python_codegen", + "ruff_python_formatter", + "ruff_python_index", + "ruff_python_parser", + "ruff_source_file", + "ruff_text_size", + "ruff_workspace", + "rustc-hash", + "serde", + "serde_json", + "similar", + "tracing", +] + [[package]] name = "ruff_shrinking" version = "0.2.2" @@ -2640,6 +2741,17 @@ dependencies = [ "serde", ] +[[package]] +name = "serde_repr" +version = "0.1.18" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0b2e6b945e9d3df726b65d6ee24060aff8e3533d431f677a9695db04eff9dfdb" +dependencies = [ + "proc-macro2", + "quote", + "syn 2.0.49", +] + [[package]] name = "serde_spanned" version = "0.6.5" @@ -3092,6 +3204,17 @@ dependencies = [ "tracing-subscriber", ] +[[package]] +name = "tracing-log" +version = "0.1.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f751112709b4e791d8ce53e32c4ed2d353565a795ce84da2285393f41557bdf2" +dependencies = [ + "log", + "once_cell", + "tracing-core", +] + [[package]] name = "tracing-log" version = "0.2.0" @@ -3118,7 +3241,19 @@ dependencies = [ "thread_local", "tracing", "tracing-core", - "tracing-log", + "tracing-log 0.2.0", +] + +[[package]] +name = "tracing-tree" +version = "0.2.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2ec6adcab41b1391b08a308cc6302b79f8095d1673f6947c2dc65ffb028b0b2d" +dependencies = [ + "nu-ansi-term", + "tracing-core", + "tracing-log 0.1.4", + "tracing-subscriber", ] [[package]] diff --git a/Cargo.toml b/Cargo.toml index a9ba136409fa54..bd53fc94d9dd80 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -32,6 +32,7 @@ console_error_panic_hook = { version = "0.1.7" } console_log = { version = "1.0.0" } countme = { version ="3.0.1"} criterion = { version = "0.5.1", default-features = false } +crossbeam = { version = "0.8.4" } dirs = { version = "5.0.0" } drop_bomb = { version = "0.1.5" } env_logger = { version ="0.10.1"} @@ -52,10 +53,14 @@ 
is-macro = { version = "0.3.5" } is-wsl = { version = "0.4.0" } itertools = { version = "0.12.1" } js-sys = { version = "0.3.67" } +jod-thread = { version = "0.1.2" } lalrpop-util = { version = "0.20.0", default-features = false } lexical-parse-float = { version = "0.8.0", features = ["format"] } +libc = { version = "0.2.153" } libcst = { version = "1.1.0", default-features = false } log = { version = "0.4.17" } +lsp-server = { version = "0.7.6" } +lsp-types = { version = "0.95.0", features = ["proposed"] } memchr = { version = "2.7.1" } mimalloc = { version ="0.1.39"} natord = { version = "1.0.9" } @@ -64,6 +69,7 @@ once_cell = { version = "1.19.0" } path-absolutize = { version = "3.1.1" } pathdiff = { version = "0.2.1" } pep440_rs = { version = "0.4.0", features = ["serde"] } +phf = { version = "0.11.2", features = ["macros"] } pretty_assertions = "1.3.0" proc-macro2 = { version = "1.0.78" } pyproject-toml = { version = "0.9.0" } @@ -98,6 +104,7 @@ toml = { version = "0.8.9" } tracing = { version = "0.1.40" } tracing-indicatif = { version = "0.3.6" } tracing-subscriber = { version = "0.3.18", features = ["env-filter"] } +tracing-tree = { version = "0.2.4" } typed-arena = { version = "2.0.2" } unic-ucd-category = { version ="0.9"} unicode-ident = { version = "1.0.12" } diff --git a/crates/ruff/Cargo.toml b/crates/ruff/Cargo.toml index 13dd5e31d9d562..076fa66fd8f14d 100644 --- a/crates/ruff/Cargo.toml +++ b/crates/ruff/Cargo.toml @@ -20,6 +20,7 @@ ruff_macros = { path = "../ruff_macros" } ruff_notebook = { path = "../ruff_notebook" } ruff_python_ast = { path = "../ruff_python_ast" } ruff_python_formatter = { path = "../ruff_python_formatter" } +ruff_server = { path = "../ruff_server" } ruff_source_file = { path = "../ruff_source_file" } ruff_text_size = { path = "../ruff_text_size" } ruff_workspace = { path = "../ruff_workspace" } @@ -52,6 +53,8 @@ tempfile = { workspace = true } thiserror = { workspace = true } toml = { workspace = true } tracing = { workspace = 
true, features = ["log"] } +tracing-subscriber = { workspace = true, features = ["registry"]} +tracing-tree = { workspace = true } walkdir = { workspace = true } wild = { workspace = true } diff --git a/crates/ruff/src/args.rs b/crates/ruff/src/args.rs index e7bb733a7956fd..52c65532e8d8b3 100644 --- a/crates/ruff/src/args.rs +++ b/crates/ruff/src/args.rs @@ -88,6 +88,8 @@ pub enum Command { GenerateShellCompletion { shell: clap_complete_command::Shell }, /// Run the Ruff formatter on the given files or directories. Format(FormatCommand), + /// Run the language server. + Server(ServerCommand), /// Display Ruff's version Version { #[arg(long, value_enum, default_value = "text")] @@ -506,6 +508,9 @@ pub struct FormatCommand { pub range: Option, } +#[derive(Clone, Debug, clap::Parser)] +pub struct ServerCommand; + #[derive(Debug, Clone, Copy, clap::ValueEnum)] pub enum HelpFormat { Text, diff --git a/crates/ruff/src/commands/mod.rs b/crates/ruff/src/commands/mod.rs index 554a7a454add24..787a22ed434510 100644 --- a/crates/ruff/src/commands/mod.rs +++ b/crates/ruff/src/commands/mod.rs @@ -7,6 +7,7 @@ pub(crate) mod format; pub(crate) mod format_stdin; pub(crate) mod linter; pub(crate) mod rule; +pub(crate) mod server; pub(crate) mod show_files; pub(crate) mod show_settings; pub(crate) mod version; diff --git a/crates/ruff/src/commands/server.rs b/crates/ruff/src/commands/server.rs new file mode 100644 index 00000000000000..5ca37ed2b50077 --- /dev/null +++ b/crates/ruff/src/commands/server.rs @@ -0,0 +1,69 @@ +use crate::ExitStatus; +use anyhow::Result; +use ruff_linter::logging::LogLevel; +use ruff_server::Server; +use tracing::{level_filters::LevelFilter, metadata::Level, subscriber::Interest, Metadata}; +use tracing_subscriber::{ + layer::{Context, Filter, SubscriberExt}, + Layer, Registry, +}; +use tracing_tree::time::Uptime; + +pub(crate) fn run_server(log_level: LogLevel) -> Result { + let trace_level = if log_level == LogLevel::Verbose { + Level::TRACE + } else { + 
Level::DEBUG + }; + + let subscriber = Registry::default().with( + tracing_tree::HierarchicalLayer::default() + .with_indent_lines(true) + .with_indent_amount(2) + .with_bracketed_fields(true) + .with_targets(true) + .with_writer(|| Box::new(std::io::stderr())) + .with_timer(Uptime::default()) + .with_filter(LoggingFilter { trace_level }), + ); + + tracing::subscriber::set_global_default(subscriber)?; + + let server = Server::new()?; + + server.run().map(|()| ExitStatus::Success) +} + +struct LoggingFilter { + trace_level: Level, +} + +impl LoggingFilter { + fn is_enabled(&self, meta: &Metadata<'_>) -> bool { + let filter = if meta.target().starts_with("ruff") { + self.trace_level + } else { + Level::INFO + }; + + meta.level() <= &filter + } +} + +impl Filter for LoggingFilter { + fn enabled(&self, meta: &Metadata<'_>, _cx: &Context<'_, S>) -> bool { + self.is_enabled(meta) + } + + fn callsite_enabled(&self, meta: &'static Metadata<'static>) -> Interest { + if self.is_enabled(meta) { + Interest::always() + } else { + Interest::never() + } + } + + fn max_level_hint(&self) -> Option { + Some(LevelFilter::from_level(self.trace_level)) + } +} diff --git a/crates/ruff/src/lib.rs b/crates/ruff/src/lib.rs index f2414af7b99746..0eee6de4a7b080 100644 --- a/crates/ruff/src/lib.rs +++ b/crates/ruff/src/lib.rs @@ -7,6 +7,7 @@ use std::process::ExitCode; use std::sync::mpsc::channel; use anyhow::Result; +use args::ServerCommand; use clap::CommandFactory; use colored::Colorize; use log::warn; @@ -200,6 +201,7 @@ pub fn run( } Command::Check(args) => check(args, log_level), Command::Format(args) => format(args, log_level), + Command::Server(args) => server(args, log_level), } } @@ -213,6 +215,12 @@ fn format(args: FormatCommand, log_level: LogLevel) -> Result { } } +#[allow(clippy::needless_pass_by_value)] // TODO: remove once we start taking arguments from here +fn server(args: ServerCommand, log_level: LogLevel) -> Result { + let ServerCommand {} = args; + 
commands::server::run_server(log_level) +} + pub fn check(args: CheckCommand, log_level: LogLevel) -> Result { let (cli, config_arguments) = args.partition()?; diff --git a/crates/ruff_formatter/src/lib.rs b/crates/ruff_formatter/src/lib.rs index a78cfab1ebc0a6..7f20543ee07369 100644 --- a/crates/ruff_formatter/src/lib.rs +++ b/crates/ruff_formatter/src/lib.rs @@ -545,6 +545,10 @@ impl PrintedRange { &self.code } + pub fn into_code(self) -> String { + self.code + } + /// The range the formatted code corresponds to in the source document. pub fn source_range(&self) -> TextRange { self.source_range diff --git a/crates/ruff_server/Cargo.toml b/crates/ruff_server/Cargo.toml new file mode 100644 index 00000000000000..3035e8397d61b1 --- /dev/null +++ b/crates/ruff_server/Cargo.toml @@ -0,0 +1,46 @@ +[package] +name = "ruff_server" +version = "0.2.2" +publish = false +authors = { workspace = true } +edition = { workspace = true } +rust-version = { workspace = true } +homepage = { workspace = true } +documentation = { workspace = true } +repository = { workspace = true } +license = { workspace = true } + +[lib] + +[dependencies] +ruff_diagnostics = { path = "../ruff_diagnostics" } +ruff_formatter = { path = "../ruff_formatter" } +ruff_linter = { path = "../ruff_linter" } +ruff_python_ast = { path = "../ruff_python_ast" } +ruff_python_codegen = { path = "../ruff_python_codegen" } +ruff_python_formatter = { path = "../ruff_python_formatter" } +ruff_python_index = { path = "../ruff_python_index" } +ruff_python_parser = { path = "../ruff_python_parser" } +ruff_source_file = { path = "../ruff_source_file" } +ruff_text_size = { path = "../ruff_text_size" } +ruff_workspace = { path = "../ruff_workspace" } + +anyhow = { workspace = true } +crossbeam = { workspace = true } +jod-thread = { workspace = true } +libc = { workspace = true } +lsp-server = { workspace = true } +lsp-types = { workspace = true } +notify = { workspace = true } +phf = { workspace = true } +rustc-hash = { 
workspace = true } +serde = { workspace = true } +serde_json = { workspace = true } +similar = { workspace = true } +tracing = { workspace = true } + +[dev-dependencies] +insta = { workspace = true } + +[lints] +workspace = true diff --git a/crates/ruff_server/README.md b/crates/ruff_server/README.md new file mode 100644 index 00000000000000..4dfbcb50eb0d68 --- /dev/null +++ b/crates/ruff_server/README.md @@ -0,0 +1,2 @@ +## The Ruff Language Server + diff --git a/crates/ruff_server/resources/test/fixtures/pandas_html.py b/crates/ruff_server/resources/test/fixtures/pandas_html.py new file mode 100644 index 00000000000000..a4669fa1feff05 --- /dev/null +++ b/crates/ruff_server/resources/test/fixtures/pandas_html.py @@ -0,0 +1,1240 @@ +# +------------------------------------------------------------+ +# | Code adopted from: | +# | Repository: https://github.com/pandas-dev/pandas.git | +# | File: `io/html.py` | +# | Commit: 1f622e2b5303650fa5e497e4552d0554e51049cb | +# +------------------------------------------------------------+ +# This file should be used to test LSP functions that edit / fix a file. + +""" +:mod:`pandas.io.html` is a module containing functionality for dealing with +HTML IO. 
+ +""" + +from __future__ import annotations + +from collections import abc +import errno +import numbers +import os +import re +from re import Pattern +from typing import ( + TYPE_CHECKING, + Literal, + cast, +) + +from pandas._libs import lib +from pandas.compat._optional import import_optional_dependency +from pandas.errors import ( + AbstractMethodError, + EmptyDataError, +) +from pandas.util._decorators import doc +from pandas.util._validators import check_dtype_backend + +from pandas.core.dtypes.common import is_list_like + +from pandas import isna +from pandas.core.indexes.base import Index +from pandas.core.indexes.multi import MultiIndex +from pandas.core.series import Series +from pandas.core.shared_docs import _shared_docs + +from pandas.io.common import ( + get_handle, + is_url, + stringify_path, + validate_header_arg, +) +from pandas.io.formats.printing import pprint_thing +from pandas.io.parsers import TextParser + +if TYPE_CHECKING: + from collections.abc import ( + Iterable, + Sequence, + ) + + from pandas._typing import ( + BaseBuffer, + DtypeBackend, + FilePath, + HTMLFlavors, + ReadBuffer, + StorageOptions, + ) + + from pandas import DataFrame + +############# +# READ HTML # +############# +_RE_WHITESPACE = re.compile(r"[\r\n]+|\s{2,}") + + +def _remove_whitespace(s: str, regex: Pattern = _RE_WHITESPACE) -> str: + """ + Replace extra whitespace inside of a string with a single space. + + Parameters + ---------- + s : str or unicode + The string from which to remove extra whitespace. + regex : re.Pattern + The regular expression to use to remove extra whitespace. + + Returns + ------- + subd : str or unicode + `s` with all extra whitespace replaced with a single space. + """ + return regex.sub(" ", s.strip()) + + +def _get_skiprows(skiprows: int | Sequence[int] | slice | None) -> int | Sequence[int]: + """ + Get an iterator given an integer, slice or container. 
+ + Parameters + ---------- + skiprows : int, slice, container + The iterator to use to skip rows; can also be a slice. + + Raises + ------ + TypeError + * If `skiprows` is not a slice, integer, or Container + + Returns + ------- + it : iterable + A proper iterator to use to skip rows of a DataFrame. + """ + if isinstance(skiprows, slice): + start, step = skiprows.start or 0, skiprows.step or 1 + return list(range(start, skiprows.stop, step)) + elif isinstance(skiprows, numbers.Integral) or is_list_like(skiprows): + return cast("int | Sequence[int]", skiprows) + elif skiprows is None: + return 0 + raise TypeError(f"{type(skiprows).__name__} is not a valid type for skipping rows") + + +def _read( + obj: FilePath | BaseBuffer, + encoding: str | None, + storage_options: StorageOptions | None, +) -> str | bytes: + """ + Try to read from a url, file or string. + + Parameters + ---------- + obj : str, unicode, path object, or file-like object + + Returns + ------- + raw_text : str + """ + try: + with get_handle( + obj, "r", encoding=encoding, storage_options=storage_options + ) as handles: + return handles.handle.read() + except OSError as err: + if not is_url(obj): + raise FileNotFoundError( + f"[Errno {errno.ENOENT}] {os.strerror(errno.ENOENT)}: {obj}" + ) from err + raise + + +class _HtmlFrameParser: + """ + Base class for parsers that parse HTML into DataFrames. + + Parameters + ---------- + io : str or file-like + This can be either a string path, a valid URL using the HTTP, + FTP, or FILE protocols or a file-like object. + + match : str or regex + The text to match in the document. + + attrs : dict + List of HTML element attributes to match. + + encoding : str + Encoding to be used by parser + + displayed_only : bool + Whether or not items with "display:none" should be ignored + + extract_links : {None, "all", "header", "body", "footer"} + Table elements in the specified section(s) with tags will have their + href extracted. + + .. 
versionadded:: 1.5.0 + + Attributes + ---------- + io : str or file-like + raw HTML, URL, or file-like object + + match : regex + The text to match in the raw HTML + + attrs : dict-like + A dictionary of valid table attributes to use to search for table + elements. + + encoding : str + Encoding to be used by parser + + displayed_only : bool + Whether or not items with "display:none" should be ignored + + extract_links : {None, "all", "header", "body", "footer"} + Table elements in the specified section(s) with tags will have their + href extracted. + + .. versionadded:: 1.5.0 + + Notes + ----- + To subclass this class effectively you must override the following methods: + * :func:`_build_doc` + * :func:`_attr_getter` + * :func:`_href_getter` + * :func:`_text_getter` + * :func:`_parse_td` + * :func:`_parse_thead_tr` + * :func:`_parse_tbody_tr` + * :func:`_parse_tfoot_tr` + * :func:`_parse_tables` + * :func:`_equals_tag` + See each method's respective documentation for details on their + functionality. + """ + + def __init__( + self, + io: FilePath | ReadBuffer[str] | ReadBuffer[bytes], + match: str | Pattern, + attrs: dict[str, str] | None, + encoding: str, + displayed_only: bool, + extract_links: Literal[None, "header", "footer", "body", "all"], + storage_options: StorageOptions = None, + ) -> None: + self.io = io + self.match = match + self.attrs = attrs + self.encoding = encoding + self.displayed_only = displayed_only + self.extract_links = extract_links + self.storage_options = storage_options + + def parse_tables(self): + """ + Parse and return all tables from the DOM. + + Returns + ------- + list of parsed (header, body, footer) tuples from tables. + """ + tables = self._parse_tables(self._build_doc(), self.match, self.attrs) + return (self._parse_thead_tbody_tfoot(table) for table in tables) + + def _attr_getter(self, obj, attr): + """ + Return the attribute value of an individual DOM node. + + Parameters + ---------- + obj : node-like + A DOM node. 
+ + attr : str or unicode + The attribute, such as "colspan" + + Returns + ------- + str or unicode + The attribute value. + """ + # Both lxml and BeautifulSoup have the same implementation: + return obj.get(attr) + + def _href_getter(self, obj) -> str | None: + """ + Return a href if the DOM node contains a child or None. + + Parameters + ---------- + obj : node-like + A DOM node. + + Returns + ------- + href : str or unicode + The href from the child of the DOM node. + """ + raise AbstractMethodError(self) + + def _text_getter(self, obj): + """ + Return the text of an individual DOM node. + + Parameters + ---------- + obj : node-like + A DOM node. + + Returns + ------- + text : str or unicode + The text from an individual DOM node. + """ + raise AbstractMethodError(self) + + def _parse_td(self, obj): + """ + Return the td elements from a row element. + + Parameters + ---------- + obj : node-like + A DOM node. + + Returns + ------- + list of node-like + These are the elements of each row, i.e., the columns. + """ + raise AbstractMethodError(self) + + def _parse_thead_tr(self, table): + """ + Return the list of thead row elements from the parsed table element. + + Parameters + ---------- + table : a table element that contains zero or more thead elements. + + Returns + ------- + list of node-like + These are the row elements of a table. + """ + raise AbstractMethodError(self) + + def _parse_tbody_tr(self, table): + """ + Return the list of tbody row elements from the parsed table element. + + HTML5 table bodies consist of either 0 or more elements (which + only contain elements) or 0 or more elements. This method + checks for both structures. + + Parameters + ---------- + table : a table element that contains row elements. + + Returns + ------- + list of node-like + These are the row elements of a table. + """ + raise AbstractMethodError(self) + + def _parse_tfoot_tr(self, table): + """ + Return the list of tfoot row elements from the parsed table element. 
+ + Parameters + ---------- + table : a table element that contains row elements. + + Returns + ------- + list of node-like + These are the row elements of a table. + """ + raise AbstractMethodError(self) + + def _parse_tables(self, document, match, attrs): + """ + Return all tables from the parsed DOM. + + Parameters + ---------- + document : the DOM from which to parse the table element. + + match : str or regular expression + The text to search for in the DOM tree. + + attrs : dict + A dictionary of table attributes that can be used to disambiguate + multiple tables on a page. + + Raises + ------ + ValueError : `match` does not match any text in the document. + + Returns + ------- + list of node-like + HTML
elements to be parsed into raw data. + """ + raise AbstractMethodError(self) + + def _equals_tag(self, obj, tag) -> bool: + """ + Return whether an individual DOM node matches a tag + + Parameters + ---------- + obj : node-like + A DOM node. + + tag : str + Tag name to be checked for equality. + + Returns + ------- + boolean + Whether `obj`'s tag name is `tag` + """ + raise AbstractMethodError(self) + + def _build_doc(self): + """ + Return a tree-like object that can be used to iterate over the DOM. + + Returns + ------- + node-like + The DOM from which to parse the table element. + """ + raise AbstractMethodError(self) + + def _parse_thead_tbody_tfoot(self, table_html): + """ + Given a table, return parsed header, body, and foot. + + Parameters + ---------- + table_html : node-like + + Returns + ------- + tuple of (header, body, footer), each a list of list-of-text rows. + + Notes + ----- + Header and body are lists-of-lists. Top level list is a list of + rows. Each row is a list of str text. + + Logic: Use , , elements to identify + header, body, and footer, otherwise: + - Put all rows into body + - Move rows from top of body to header only if + all elements inside row are . Move the top all- or + while body_rows and row_is_all_th(body_rows[0]): + header_rows.append(body_rows.pop(0)) + + header = self._expand_colspan_rowspan(header_rows, section="header") + body = self._expand_colspan_rowspan(body_rows, section="body") + footer = self._expand_colspan_rowspan(footer_rows, section="footer") + + return header, body, footer + + def _expand_colspan_rowspan( + self, rows, section: Literal["header", "footer", "body"] + ) -> list[list]: + """ + Given a list of s, return a list of text rows. + + Parameters + ---------- + rows : list of node-like + List of s + section : the section that the rows belong to (header, body or footer). + + Returns + ------- + list of list + Each returned row is a list of str text, or tuple (text, link) + if extract_links is not None. 
+ + Notes + ----- + Any cell with ``rowspan`` or ``colspan`` will have its contents copied + to subsequent cells. + """ + all_texts = [] # list of rows, each a list of str + text: str | tuple + remainder: list[ + tuple[int, str | tuple, int] + ] = [] # list of (index, text, nrows) + + for tr in rows: + texts = [] # the output for this row + next_remainder = [] + + index = 0 + tds = self._parse_td(tr) + for td in tds: + # Append texts from previous rows with rowspan>1 that come + # before this or (see _parse_thead_tr). + return row.xpath("./td|./th") + + def _parse_tables(self, document, match, kwargs): + pattern = match.pattern + + # 1. check all descendants for the given pattern and only search tables + # GH 49929 + xpath_expr = f"//table[.//text()[re:test(., {pattern!r})]]" + + # if any table attributes were given build an xpath expression to + # search for them + if kwargs: + xpath_expr += _build_xpath_expr(kwargs) + + tables = document.xpath(xpath_expr, namespaces=_re_namespace) + + tables = self._handle_hidden_tables(tables, "attrib") + if self.displayed_only: + for table in tables: + # lxml utilizes XPATH 1.0 which does not have regex + # support. As a result, we find all elements with a style + # attribute and iterate them to check for display:none + for elem in table.xpath(".//style"): + elem.drop_tree() + for elem in table.xpath(".//*[@style]"): + if "display:none" in elem.attrib.get("style", "").replace(" ", ""): + elem.drop_tree() + if not tables: + raise ValueError(f"No tables found matching regex {pattern!r}") + return tables + + def _equals_tag(self, obj, tag) -> bool: + return obj.tag == tag + + def _build_doc(self): + """ + Raises + ------ + ValueError + * If a URL that lxml cannot parse is passed. + + Exception + * Any other ``Exception`` thrown. For example, trying to parse a + URL that is syntactically correct on a machine with no internet + connection will fail. 
+ + See Also + -------- + pandas.io.html._HtmlFrameParser._build_doc + """ + from lxml.etree import XMLSyntaxError + from lxml.html import ( + HTMLParser, + parse, + ) + + parser = HTMLParser(recover=True, encoding=self.encoding) + + if is_url(self.io): + with get_handle(self.io, "r", storage_options=self.storage_options) as f: + r = parse(f.handle, parser=parser) + else: + # try to parse the input in the simplest way + try: + r = parse(self.io, parser=parser) + except OSError as err: + raise FileNotFoundError( + f"[Errno {errno.ENOENT}] {os.strerror(errno.ENOENT)}: {self.io}" + ) from err + try: + r = r.getroot() + except AttributeError: + pass + else: + if not hasattr(r, "text_content"): + raise XMLSyntaxError("no text parsed from document", 0, 0, 0) + + for br in r.xpath("*//br"): + br.tail = "\n" + (br.tail or "") + + return r + + def _parse_thead_tr(self, table): + rows = [] + + for thead in table.xpath(".//thead"): + rows.extend(thead.xpath("./tr")) + + # HACK: lxml does not clean up the clearly-erroneous + # . (Missing ). Add + # the and _pretend_ it's a ; _parse_td() will find its + # children as though it's a . + # + # Better solution would be to use html5lib. 
+ elements_at_root = thead.xpath("./td|./th") + if elements_at_root: + rows.append(thead) + + return rows + + def _parse_tbody_tr(self, table): + from_tbody = table.xpath(".//tbody//tr") + from_root = table.xpath("./tr") + # HTML spec: at most one of these lists has content + return from_tbody + from_root + + def _parse_tfoot_tr(self, table): + return table.xpath(".//tfoot//tr") + + +def _expand_elements(body) -> None: + data = [len(elem) for elem in body] + lens = Series(data) + lens_max = lens.max() + not_max = lens[lens != lens_max] + + empty = [""] + for ind, length in not_max.items(): + body[ind] += empty * (lens_max - length) + + +def _data_to_frame(**kwargs): + head, body, foot = kwargs.pop("data") + header = kwargs.pop("header") + kwargs["skiprows"] = _get_skiprows(kwargs["skiprows"]) + if head: + body = head + body + + # Infer header when there is a or top
+ - Move rows from bottom of body to footer only if + all elements inside row are + """ + header_rows = self._parse_thead_tr(table_html) + body_rows = self._parse_tbody_tr(table_html) + footer_rows = self._parse_tfoot_tr(table_html) + + def row_is_all_th(row): + return all(self._equals_tag(t, "th") for t in self._parse_td(row)) + + if not header_rows: + # The table has no
rows from + # body_rows to header_rows. (This is a common case because many + # tables in the wild have no
+ while remainder and remainder[0][0] <= index: + prev_i, prev_text, prev_rowspan = remainder.pop(0) + texts.append(prev_text) + if prev_rowspan > 1: + next_remainder.append((prev_i, prev_text, prev_rowspan - 1)) + index += 1 + + # Append the text from this , colspan times + text = _remove_whitespace(self._text_getter(td)) + if self.extract_links in ("all", section): + href = self._href_getter(td) + text = (text, href) + rowspan = int(self._attr_getter(td, "rowspan") or 1) + colspan = int(self._attr_getter(td, "colspan") or 1) + + for _ in range(colspan): + texts.append(text) + if rowspan > 1: + next_remainder.append((index, text, rowspan - 1)) + index += 1 + + # Append texts from previous rows at the final position + for prev_i, prev_text, prev_rowspan in remainder: + texts.append(prev_text) + if prev_rowspan > 1: + next_remainder.append((prev_i, prev_text, prev_rowspan - 1)) + + all_texts.append(texts) + remainder = next_remainder + + # Append rows that only appear because the previous row had non-1 + # rowspan + while remainder: + next_remainder = [] + texts = [] + for prev_i, prev_text, prev_rowspan in remainder: + texts.append(prev_text) + if prev_rowspan > 1: + next_remainder.append((prev_i, prev_text, prev_rowspan - 1)) + all_texts.append(texts) + remainder = next_remainder + + return all_texts + + def _handle_hidden_tables(self, tbl_list, attr_name: str): + """ + Return list of tables, potentially removing hidden elements + + Parameters + ---------- + tbl_list : list of node-like + Type of list elements will vary depending upon parser used + attr_name : str + Name of the accessor for retrieving HTML attributes + + Returns + ------- + list of node-like + Return type matches `tbl_list` + """ + if not self.displayed_only: + return tbl_list + + return [ + x + for x in tbl_list + if "display:none" + not in getattr(x, attr_name).get("style", "").replace(" ", "") + ] + + +class _BeautifulSoupHtml5LibFrameParser(_HtmlFrameParser): + """ + HTML to DataFrame parser 
that uses BeautifulSoup under the hood. + + See Also + -------- + pandas.io.html._HtmlFrameParser + pandas.io.html._LxmlFrameParser + + Notes + ----- + Documentation strings for this class are in the base class + :class:`pandas.io.html._HtmlFrameParser`. + """ + + def __init__(self, *args, **kwargs) -> None: + super().__init__(*args, **kwargs) + from bs4 import SoupStrainer + + self._strainer = SoupStrainer("table") + + def _parse_tables(self, document, match, attrs): + element_name = self._strainer.name + tables = document.find_all(element_name, attrs=attrs) + if not tables: + raise ValueError("No tables found") + + result = [] + unique_tables = set() + tables = self._handle_hidden_tables(tables, "attrs") + + for table in tables: + if self.displayed_only: + for elem in table.find_all("style"): + elem.decompose() + + for elem in table.find_all(style=re.compile(r"display:\s*none")): + elem.decompose() + + if table not in unique_tables and table.find(string=match) is not None: + result.append(table) + unique_tables.add(table) + if not result: + raise ValueError(f"No tables found matching pattern {match.pattern!r}") + return result + + def _href_getter(self, obj) -> str | None: + a = obj.find("a", href=True) + return None if not a else a["href"] + + def _text_getter(self, obj): + return obj.text + + def _equals_tag(self, obj, tag) -> bool: + return obj.name == tag + + def _parse_td(self, row): + return row.find_all(("td", "th"), recursive=False) + + def _parse_thead_tr(self, table): + return table.select("thead tr") + + def _parse_tbody_tr(self, table): + from_tbody = table.select("tbody tr") + from_root = table.find_all("tr", recursive=False) + # HTML spec: at most one of these lists has content + return from_tbody + from_root + + def _parse_tfoot_tr(self, table): + return table.select("tfoot tr") + + def _setup_build_doc(self): + raw_text = _read(self.io, self.encoding, self.storage_options) + if not raw_text: + raise ValueError(f"No text parsed from document: 
{self.io}") + return raw_text + + def _build_doc(self): + from bs4 import BeautifulSoup + + bdoc = self._setup_build_doc() + if isinstance(bdoc, bytes) and self.encoding is not None: + udoc = bdoc.decode(self.encoding) + from_encoding = None + else: + udoc = bdoc + from_encoding = self.encoding + + soup = BeautifulSoup(udoc, features="html5lib", from_encoding=from_encoding) + + for br in soup.find_all("br"): + br.replace_with("\n" + br.text) + + return soup + + +def _build_xpath_expr(attrs) -> str: + """ + Build an xpath expression to simulate bs4's ability to pass in kwargs to + search for attributes when using the lxml parser. + + Parameters + ---------- + attrs : dict + A dict of HTML attributes. These are NOT checked for validity. + + Returns + ------- + expr : unicode + An XPath expression that checks for the given HTML attributes. + """ + # give class attribute as class_ because class is a python keyword + if "class_" in attrs: + attrs["class"] = attrs.pop("class_") + + s = " and ".join([f"@{k}={v!r}" for k, v in attrs.items()]) + return f"[{s}]" + + +_re_namespace = {"re": "http://exslt.org/regular-expressions"} + + +class _LxmlFrameParser(_HtmlFrameParser): + """ + HTML to DataFrame parser that uses lxml under the hood. + + Warning + ------- + This parser can only handle HTTP, FTP, and FILE urls. + + See Also + -------- + _HtmlFrameParser + _BeautifulSoupLxmlFrameParser + + Notes + ----- + Documentation strings for this class are in the base class + :class:`_HtmlFrameParser`. + """ + + def _href_getter(self, obj) -> str | None: + href = obj.xpath(".//a/@href") + return None if not href else href[0] + + def _text_getter(self, obj): + return obj.text_content() + + def _parse_td(self, row): + # Look for direct children only: the "row" element here may be a + #
foobar
-only rows + if header is None: + if len(head) == 1: + header = 0 + else: + # ignore all-empty-text rows + header = [i for i, row in enumerate(head) if any(text for text in row)] + + if foot: + body += foot + + # fill out elements of body that are "ragged" + _expand_elements(body) + with TextParser(body, header=header, **kwargs) as tp: + return tp.read() + + +_valid_parsers = { + "lxml": _LxmlFrameParser, + None: _LxmlFrameParser, + "html5lib": _BeautifulSoupHtml5LibFrameParser, + "bs4": _BeautifulSoupHtml5LibFrameParser, +} + + +def _parser_dispatch(flavor: HTMLFlavors | None) -> type[_HtmlFrameParser]: + """ + Choose the parser based on the input flavor. + + Parameters + ---------- + flavor : {{"lxml", "html5lib", "bs4"}} or None + The type of parser to use. This must be a valid backend. + + Returns + ------- + cls : _HtmlFrameParser subclass + The parser class based on the requested input flavor. + + Raises + ------ + ValueError + * If `flavor` is not a valid backend. + ImportError + * If you do not have the requested `flavor` + """ + valid_parsers = list(_valid_parsers.keys()) + if flavor not in valid_parsers: + raise ValueError( + f"{flavor!r} is not a valid flavor, valid flavors are {valid_parsers}" + ) + + if flavor in ("bs4", "html5lib"): + import_optional_dependency("html5lib") + import_optional_dependency("bs4") + else: + import_optional_dependency("lxml.etree") + return _valid_parsers[flavor] + + +def _print_as_set(s) -> str: + arg = ", ".join([pprint_thing(el) for el in s]) + return f"{{{arg}}}" + + +def _validate_flavor(flavor): + if flavor is None: + flavor = "lxml", "bs4" + elif isinstance(flavor, str): + flavor = (flavor,) + elif isinstance(flavor, abc.Iterable): + if not all(isinstance(flav, str) for flav in flavor): + raise TypeError( + f"Object of type {type(flavor).__name__!r} " + f"is not an iterable of strings" + ) + else: + msg = repr(flavor) if isinstance(flavor, str) else str(flavor) + msg += " is not a valid flavor" + raise ValueError(msg) 
+ + flavor = tuple(flavor) + valid_flavors = set(_valid_parsers) + flavor_set = set(flavor) + + if not flavor_set & valid_flavors: + raise ValueError( + f"{_print_as_set(flavor_set)} is not a valid set of flavors, valid " + f"flavors are {_print_as_set(valid_flavors)}" + ) + return flavor + + +def _parse( + flavor, + io, + match, + attrs, + encoding, + displayed_only, + extract_links, + storage_options, + **kwargs, +): + flavor = _validate_flavor(flavor) + compiled_match = re.compile(match) # you can pass a compiled regex here + + retained = None + for flav in flavor: + parser = _parser_dispatch(flav) + p = parser( + io, + compiled_match, + attrs, + encoding, + displayed_only, + extract_links, + storage_options, + ) + + try: + tables = p.parse_tables() + except ValueError as caught: + # if `io` is an io-like object, check if it's seekable + # and try to rewind it before trying the next parser + if hasattr(io, "seekable") and io.seekable(): + io.seek(0) + elif hasattr(io, "seekable") and not io.seekable(): + # if we couldn't rewind it, let the user know + raise ValueError( + f"The flavor {flav} failed to parse your input. " + "Since you passed a non-rewindable file " + "object, we can't rewind it to try " + "another parser. Try read_html() with a different flavor." + ) from caught + + retained = caught + else: + break + else: + assert retained is not None # for mypy + raise retained + + ret = [] + for table in tables: + try: + df = _data_to_frame(data=table, **kwargs) + # Cast MultiIndex header to an Index of tuples when extracting header + # links and replace nan with None (therefore can't use mi.to_flat_index()). + # This maintains consistency of selection (e.g. 
df.columns.str[1]) + if extract_links in ("all", "header") and isinstance( + df.columns, MultiIndex + ): + df.columns = Index( + ((col[0], None if isna(col[1]) else col[1]) for col in df.columns), + tupleize_cols=False, + ) + + ret.append(df) + except EmptyDataError: # empty table + continue + return ret + + +@doc(storage_options=_shared_docs["storage_options"]) +def read_html( + io: FilePath | ReadBuffer[str], + *, + match: str | Pattern = ".+", + flavor: HTMLFlavors | Sequence[HTMLFlavors] | None = None, + header: int | Sequence[int] | None = None, + index_col: int | Sequence[int] | None = None, + skiprows: int | Sequence[int] | slice | None = None, + attrs: dict[str, str] | None = None, + parse_dates: bool = False, + thousands: str | None = ",", + encoding: str | None = None, + decimal: str = ".", + converters: dict | None = None, + na_values: Iterable[object] | None = None, + keep_default_na: bool = True, + displayed_only: bool = True, + extract_links: Literal[None, "header", "footer", "body", "all"] = None, + dtype_backend: DtypeBackend | lib.NoDefault = lib.no_default, + storage_options: StorageOptions = None, +) -> list[DataFrame]: + r""" + Read HTML tables into a ``list`` of ``DataFrame`` objects. + + Parameters + ---------- + io : str, path object, or file-like object + String, path object (implementing ``os.PathLike[str]``), or file-like + object implementing a string ``read()`` function. + The string can represent a URL. Note that + lxml only accepts the http, ftp and file url protocols. If you have a + URL that starts with ``'https'`` you might try removing the ``'s'``. + + .. deprecated:: 2.1.0 + Passing html literal strings is deprecated. + Wrap literal string/bytes input in ``io.StringIO``/``io.BytesIO`` instead. + + match : str or compiled regular expression, optional + The set of tables containing text matching this regex or string will be + returned. Unless the HTML is extremely simple you will probably need to + pass a non-empty string here. 
Defaults to '.+' (match any non-empty + string). The default value will return all tables contained on a page. + This value is converted to a regular expression so that there is + consistent behavior between Beautiful Soup and lxml. + + flavor : {{"lxml", "html5lib", "bs4"}} or list-like, optional + The parsing engine (or list of parsing engines) to use. 'bs4' and + 'html5lib' are synonymous with each other, they are both there for + backwards compatibility. The default of ``None`` tries to use ``lxml`` + to parse and if that fails it falls back on ``bs4`` + ``html5lib``. + + header : int or list-like, optional + The row (or list of rows for a :class:`~pandas.MultiIndex`) to use to + make the columns headers. + + index_col : int or list-like, optional + The column (or list of columns) to use to create the index. + + skiprows : int, list-like or slice, optional + Number of rows to skip after parsing the column integer. 0-based. If a + sequence of integers or a slice is given, will skip the rows indexed by + that sequence. Note that a single element sequence means 'skip the nth + row' whereas an integer means 'skip n rows'. + + attrs : dict, optional + This is a dictionary of attributes that you can pass to use to identify + the table in the HTML. These are not checked for validity before being + passed to lxml or Beautiful Soup. However, these attributes must be + valid HTML table attributes to work correctly. For example, :: + + attrs = {{"id": "table"}} + + is a valid attribute dictionary because the 'id' HTML tag attribute is + a valid HTML attribute for *any* HTML tag as per `this document + `__. :: + + attrs = {{"asdf": "table"}} + + is *not* a valid attribute dictionary because 'asdf' is not a valid + HTML attribute even if it is a valid XML attribute. Valid HTML 4.01 + table attributes can be found `here + `__. A + working draft of the HTML 5 spec can be found `here + `__. It contains the + latest information on table attributes for the modern web. 
+ + parse_dates : bool, optional + See :func:`~read_csv` for more details. + + thousands : str, optional + Separator to use to parse thousands. Defaults to ``','``. + + encoding : str, optional + The encoding used to decode the web page. Defaults to ``None``.``None`` + preserves the previous encoding behavior, which depends on the + underlying parser library (e.g., the parser library will try to use + the encoding provided by the document). + + decimal : str, default '.' + Character to recognize as decimal point (e.g. use ',' for European + data). + + converters : dict, default None + Dict of functions for converting values in certain columns. Keys can + either be integers or column labels, values are functions that take one + input argument, the cell (not column) content, and return the + transformed content. + + na_values : iterable, default None + Custom NA values. + + keep_default_na : bool, default True + If na_values are specified and keep_default_na is False the default NaN + values are overridden, otherwise they're appended to. + + displayed_only : bool, default True + Whether elements with "display: none" should be parsed. + + extract_links : {{None, "all", "header", "body", "footer"}} + Table elements in the specified section(s) with tags will have their + href extracted. + + .. versionadded:: 1.5.0 + + dtype_backend : {{'numpy_nullable', 'pyarrow'}}, default 'numpy_nullable' + Back-end data type applied to the resultant :class:`DataFrame` + (still experimental). Behaviour is as follows: + + * ``"numpy_nullable"``: returns nullable-dtype-backed :class:`DataFrame` + (default). + * ``"pyarrow"``: returns pyarrow-backed nullable :class:`ArrowDtype` + DataFrame. + + .. versionadded:: 2.0 + + {storage_options} + + .. versionadded:: 2.1.0 + + Returns + ------- + dfs + A list of DataFrames. + + See Also + -------- + read_csv : Read a comma-separated values (csv) file into DataFrame. 
+ + Notes + ----- + Before using this function you should read the :ref:`gotchas about the + HTML parsing libraries `. + + Expect to do some cleanup after you call this function. For example, you + might need to manually assign column names if the column names are + converted to NaN when you pass the `header=0` argument. We try to assume as + little as possible about the structure of the table and push the + idiosyncrasies of the HTML contained in the table to the user. + + This function searches for ```` elements and only for ```` + and ```` or ```` argument, it is used to construct + the header, otherwise the function attempts to find the header within + the body (by putting rows with only ``
`` rows and ```` elements within each ``
`` + element in the table. ```` stands for "table data". This function + attempts to properly handle ``colspan`` and ``rowspan`` attributes. + If the function has a ``
`` elements into the header). + + Similar to :func:`~read_csv` the `header` argument is applied + **after** `skiprows` is applied. + + This function will *always* return a list of :class:`DataFrame` *or* + it will fail, e.g., it will *not* return an empty list. + + Examples + -------- + See the :ref:`read_html documentation in the IO section of the docs + ` for some examples of reading in HTML tables. + """ + # Type check here. We don't want to parse only to fail because of an + # invalid value of an integer skiprows. + if isinstance(skiprows, numbers.Integral) and skiprows < 0: + raise ValueError( + "cannot skip rows starting from the end of the " + "data (you passed a negative value)" + ) + if extract_links not in [None, "header", "footer", "body", "all"]: + raise ValueError( + "`extract_links` must be one of " + '{None, "header", "footer", "body", "all"}, got ' + f'"{extract_links}"' + ) + + validate_header_arg(header) + check_dtype_backend(dtype_backend) + + io = stringify_path(io) + + return _parse( + flavor=flavor, + io=io, + match=match, + header=header, + index_col=index_col, + skiprows=skiprows, + parse_dates=parse_dates, + thousands=thousands, + attrs=attrs, + encoding=encoding, + decimal=decimal, + converters=converters, + na_values=na_values, + keep_default_na=keep_default_na, + displayed_only=displayed_only, + extract_links=extract_links, + dtype_backend=dtype_backend, + storage_options=storage_options, + ) diff --git a/crates/ruff_server/src/edit.rs b/crates/ruff_server/src/edit.rs new file mode 100644 index 00000000000000..3e2e696607690a --- /dev/null +++ b/crates/ruff_server/src/edit.rs @@ -0,0 +1,48 @@ +//! Types and utilities for working with text, modifying source files, and `Ruff <-> LSP` type conversion. + +mod document; +mod range; + +pub use document::Document; +use lsp_types::PositionEncodingKind; +pub(crate) use range::{text_range, text_range_to_range}; + +/// A convenient enumeration for supported text encodings. 
Can be converted to [`lsp_types::PositionEncodingKind`]. +#[derive(Default, Debug, Copy, Clone, PartialEq, Eq)] +/// UTF 16 is the encoding supported by all LSP clients. +pub enum PositionEncoding { + #[default] + UTF16, + + /// Ruff's preferred encoding + UTF8, + + /// Second choice because UTF32 uses a fixed 4 byte encoding for each character (makes conversion relatively easy) + UTF32, +} + +impl From for lsp_types::PositionEncodingKind { + fn from(value: PositionEncoding) -> Self { + match value { + PositionEncoding::UTF8 => lsp_types::PositionEncodingKind::UTF8, + PositionEncoding::UTF16 => lsp_types::PositionEncodingKind::UTF16, + PositionEncoding::UTF32 => lsp_types::PositionEncodingKind::UTF32, + } + } +} + +impl TryFrom for PositionEncoding { + type Error = (); + + fn try_from(value: PositionEncodingKind) -> Result { + Ok(if value == PositionEncodingKind::UTF8 { + PositionEncoding::UTF8 + } else if value == PositionEncodingKind::UTF16 { + PositionEncoding::UTF16 + } else if value == PositionEncodingKind::UTF32 { + PositionEncoding::UTF32 + } else { + return Err(()); + }) + } +} diff --git a/crates/ruff_server/src/edit/document.rs b/crates/ruff_server/src/edit/document.rs new file mode 100644 index 00000000000000..24c56309a1d581 --- /dev/null +++ b/crates/ruff_server/src/edit/document.rs @@ -0,0 +1,120 @@ +use lsp_types::{Position, TextDocumentContentChangeEvent}; +use ruff_source_file::LineIndex; + +use crate::PositionEncoding; + +use super::range::text_range; + +#[derive(Debug, Clone)] +pub struct Document { + contents: String, + index: LineIndex, + version: i32, +} + +impl Document { + pub fn new(contents: String, version: i32) -> Self { + let index = LineIndex::from_source_text(&contents); + Self { + contents, + index, + version, + } + } + // TODO(jane): I would personally be in favor of removing access to this method and only + // allowing document mutation via specialized methods. 
+ pub(crate) fn modify(&mut self, func: impl FnOnce(&mut String, &mut i32)) { + self.modify_with_manual_index(|c, v, i| { + func(c, v); + *i = LineIndex::from_source_text(c); + }); + } + + // A private function for overriding how we update the line index by default. + fn modify_with_manual_index( + &mut self, + func: impl FnOnce(&mut String, &mut i32, &mut LineIndex), + ) { + let old_version = self.version; + func(&mut self.contents, &mut self.version, &mut self.index); + debug_assert!(self.version >= old_version); + } +} + +/* Mutable API */ +impl Document { + pub fn apply_changes( + &mut self, + changes: Vec, + new_version: i32, + encoding: PositionEncoding, + ) { + if let [lsp_types::TextDocumentContentChangeEvent { + range: None, text, .. + }] = changes.as_slice() + { + tracing::debug!("Fast path - replacing entire document"); + self.modify(|contents, version| { + *contents = text.clone(); + *version = new_version; + }); + return; + } + + let mut new_contents = self.contents().to_string(); + let mut active_index = None; + + let mut last_position = Position { + line: u32::MAX, + character: u32::MAX, + }; + + for TextDocumentContentChangeEvent { + range, + text: change, + .. 
+ } in changes + { + if let Some(range) = range { + if last_position <= range.end { + active_index.replace(LineIndex::from_source_text(&new_contents)); + } + + last_position = range.start; + let range = text_range( + range, + &new_contents, + active_index.as_ref().unwrap_or(self.index()), + encoding, + ); + + new_contents.replace_range( + usize::from(range.start())..usize::from(range.end()), + &change, + ); + } else { + new_contents = change; + last_position = Position::default(); + } + } + + self.modify_with_manual_index(|contents, version, index| { + *index = LineIndex::from_source_text(&new_contents); + *contents = new_contents; + *version = new_version; + }); + } +} + +/* Immutable API */ +impl Document { + pub fn contents(&self) -> &str { + &self.contents + } + pub fn index(&self) -> &LineIndex { + &self.index + } + pub fn version(&self) -> i32 { + self.version + } +} diff --git a/crates/ruff_server/src/edit/range.rs b/crates/ruff_server/src/edit/range.rs new file mode 100644 index 00000000000000..669dad2009f844 --- /dev/null +++ b/crates/ruff_server/src/edit/range.rs @@ -0,0 +1,148 @@ +use super::{Document, PositionEncoding}; +use lsp_types as types; +use ruff_source_file::OneIndexed; +use ruff_source_file::{LineIndex, SourceLocation}; +use ruff_text_size::{TextRange, TextSize}; + +/// Returns the [`TextRange`] for a LSP [`Range`] respecting the negotiated [`PositionEncoding`]. 
+pub(crate) fn text_range( + range: types::Range, + text: &str, + index: &LineIndex, + encoding: PositionEncoding, +) -> TextRange { + let start_line = index.line_range( + OneIndexed::from_zero_indexed(range.start.line as usize), + text, + ); + let end_line = index.line_range(OneIndexed::from_zero_indexed(range.end.line as usize), text); + + let (start_column_offset, end_column_offset) = match encoding { + PositionEncoding::UTF8 => ( + TextSize::new(range.start.character), + TextSize::new(range.end.character), + ), + + PositionEncoding::UTF16 => { + // Fast path for ASCII only documents + if index.is_ascii() { + ( + TextSize::new(range.start.character), + TextSize::new(range.end.character), + ) + } else { + // UTF16 encodes characters either as one or two 16 bit words. + // The position in `range` is the 16-bit word offset from the start of the line (and not the character offset) + // UTF-16 with a text that may use variable-length characters. + ( + utf16_column_offset(range.start.character, &text[start_line]), + utf16_column_offset(range.end.character, &text[end_line]), + ) + } + } + PositionEncoding::UTF32 => { + // UTF-32 uses 4 bytes for each character. Meaning, the position in range is a character offset. 
+ return TextRange::new( + index.offset( + OneIndexed::from_zero_indexed(range.start.line as usize), + OneIndexed::from_zero_indexed(range.start.character as usize), + text, + ), + index.offset( + OneIndexed::from_zero_indexed(range.end.line as usize), + OneIndexed::from_zero_indexed(range.end.character as usize), + text, + ), + ); + } + }; + + TextRange::new( + start_line.start() + start_column_offset.clamp(TextSize::new(0), start_line.end()), + end_line.start() + end_column_offset.clamp(TextSize::new(0), end_line.end()), + ) +} + +fn utf16_column_offset(character: u32, line: &str) -> TextSize { + let mut character_offset = TextSize::new(0); + + let mut i = 0u32; + + for c in line.chars() { + if i >= character { + break; + } + + // Count characters encoded as two 16 bit words as 2 characters. + // SAFETY: Value is always between 1 and 2, casting to u32 is safe. + #[allow(clippy::cast_possible_truncation)] + { + character_offset += TextSize::new(c.len_utf8() as u32); + i += c.len_utf16() as u32; + } + } + + character_offset +} + +pub(crate) fn offset_to_position( + offset: TextSize, + text: &str, + index: &LineIndex, + encoding: PositionEncoding, +) -> types::Position { + let location = match encoding { + PositionEncoding::UTF8 => { + let row = index.line_index(offset); + let column = offset - index.line_start(row, text); + + SourceLocation { + column: OneIndexed::from_zero_indexed(column.to_usize()), + row, + } + } + PositionEncoding::UTF16 => { + let row = index.line_index(offset); + + let column = if index.is_ascii() { + (offset - index.line_start(row, text)).to_usize() + } else { + let up_to_line = &text[TextRange::new(index.line_start(row, text), offset)]; + up_to_line.encode_utf16().count() + }; + + SourceLocation { + column: OneIndexed::from_zero_indexed(column), + row, + } + } + PositionEncoding::UTF32 => index.source_location(offset, text), + }; + + #[allow(clippy::cast_possible_truncation)] + types::Position { + line: location.row.to_zero_indexed() as 
u32, + character: location.column.to_zero_indexed() as u32, + } +} + +pub(crate) fn text_range_to_range( + text_range: TextRange, + document: &Document, + encoding: PositionEncoding, +) -> types::Range { + types::Range { + start: offset_to_position( + text_range.start(), + document.contents(), + document.index(), + encoding, + ), + end: offset_to_position( + text_range.end(), + document.contents(), + document.index(), + encoding, + ), + } +} diff --git a/crates/ruff_server/src/format.rs b/crates/ruff_server/src/format.rs new file mode 100644 index 00000000000000..b747f46d5b0fcf --- /dev/null +++ b/crates/ruff_server/src/format.rs @@ -0,0 +1,29 @@ +use ruff_formatter::PrintedRange; +use ruff_python_formatter::{format_module_source, format_range}; +use ruff_text_size::TextRange; +use ruff_workspace::FormatterSettings; + +use crate::edit::Document; + +pub(crate) fn format( + document: &Document, + formatter_settings: &FormatterSettings, +) -> crate::Result { + // TODO(jane): support Jupyter Notebook + let format_options = formatter_settings + .to_format_options(ruff_python_ast::PySourceType::Python, document.contents()); + let formatted = format_module_source(document.contents(), format_options)?; + Ok(formatted.into_code()) +} + +pub(crate) fn range_format( + document: &Document, + formatter_settings: &FormatterSettings, + range: TextRange, +) -> crate::Result { + // TODO(jane): support Jupyter Notebook + let format_options = formatter_settings + .to_format_options(ruff_python_ast::PySourceType::Python, document.contents()); + + Ok(format_range(document.contents(), range, format_options)?) +} diff --git a/crates/ruff_server/src/lib.rs b/crates/ruff_server/src/lib.rs new file mode 100644 index 00000000000000..a284c7e162730f --- /dev/null +++ b/crates/ruff_server/src/lib.rs @@ -0,0 +1,23 @@ +//! 
## The Ruff Language Server + +// Use +pub use edit::{Document, PositionEncoding}; +pub use server::Server; +const SERVER_NAME: &str = "ruff"; +const DIAGNOSTIC_NAME: &str = "Ruff"; + +fn version() -> &'static str { + env!("CARGO_PKG_VERSION") +} + +// Modules +mod edit; +mod format; +mod lint; +mod server; +mod session; + +// Types +/// A common result type used in most cases where a +/// result type is needed. +pub(crate) type Result = anyhow::Result; diff --git a/crates/ruff_server/src/lint.rs b/crates/ruff_server/src/lint.rs new file mode 100644 index 00000000000000..6119dde72d26bc --- /dev/null +++ b/crates/ruff_server/src/lint.rs @@ -0,0 +1,119 @@ +//! Access to the Ruff linting API for the LSP + +use std::path::Path; + +use ruff_diagnostics::{Applicability, Diagnostic, DiagnosticKind, Fix}; +use ruff_linter::{ + directives::{extract_directives, Flags}, + linter::{check_path, LinterResult, TokenSource}, + registry::AsRule, + settings::{flags, LinterSettings}, + source_kind::SourceKind, +}; +use ruff_python_ast::PySourceType; +use ruff_python_codegen::Stylist; +use ruff_python_index::Indexer; +use ruff_python_parser::lexer::LexResult; +use ruff_python_parser::AsMode; +use ruff_source_file::Locator; +use serde::{Deserialize, Serialize}; + +use crate::{PositionEncoding, DIAGNOSTIC_NAME}; + +#[derive(Serialize, Deserialize)] +pub(crate) struct DiagnosticFix { + pub(crate) kind: DiagnosticKind, + pub(crate) fix: Fix, +} + +pub(crate) fn check( + document: &crate::edit::Document, + linter_settings: &LinterSettings, + encoding: PositionEncoding, +) -> Vec { + let contents = document.contents(); + + let source_type = PySourceType::default(); + + // TODO(jane): Support Jupyter Notebooks + let source_kind = SourceKind::Python(contents.to_string()); + + // Tokenize once. + let tokens: Vec = ruff_python_parser::tokenize(contents, source_type.as_mode()); + + // Map row and column locations to byte slices (lazily). 
+ let locator = Locator::new(contents); + + // Detect the current code style (lazily). + let stylist = Stylist::from_tokens(&tokens, &locator); + + // Extra indices from the code. + let indexer = Indexer::from_tokens(&tokens, &locator); + + // Extract the `# noqa` and `# isort: skip` directives from the source. + let directives = extract_directives(&tokens, Flags::empty(), &locator, &indexer); + + // Generate checks. + let LinterResult { + data: (diagnostics, _imports), + .. + } = check_path( + Path::new(""), + None, + &locator, + &stylist, + &indexer, + &directives, + linter_settings, + flags::Noqa::Enabled, + &source_kind, + source_type, + TokenSource::Tokens(tokens), + ); + + diagnostics + .into_iter() + .map(|diagnostic| to_lsp_diagnostic(diagnostic, document, encoding)) + .collect() +} + +fn to_lsp_diagnostic( + diagnostic: Diagnostic, + document: &crate::edit::Document, + encoding: PositionEncoding, +) -> lsp_types::Diagnostic { + let Diagnostic { + kind, range, fix, .. + } = diagnostic; + + let rule = kind.rule(); + + let data = fix.and_then(|fix| { + fix.applies(Applicability::Unsafe) + .then(|| { + serde_json::to_value(&DiagnosticFix { + kind: kind.clone(), + fix, + }) + .ok() + }) + .flatten() + }); + lsp_types::Diagnostic { + range: crate::edit::text_range_to_range(range, document, encoding), + severity: Some(lsp_types::DiagnosticSeverity::ERROR), + code: Some(lsp_types::NumberOrString::String( + rule.noqa_code().to_string(), + )), + code_description: rule.url().and_then(|url| { + Some(lsp_types::CodeDescription { + href: lsp_types::Url::parse(&url).ok()?, + }) + }), + source: Some(DIAGNOSTIC_NAME.into()), + message: kind.body, + related_information: None, + tags: None, + data, + } +} diff --git a/crates/ruff_server/src/server.rs b/crates/ruff_server/src/server.rs new file mode 100644 index 00000000000000..9295bd86a4a67e --- /dev/null +++ b/crates/ruff_server/src/server.rs @@ -0,0 +1,152 @@ +//! Scheduling, I/O, and API endpoints. 
+ +use anyhow::anyhow; +use lsp::Connection; +use lsp_server as lsp; +use lsp_types as types; +use types::ClientCapabilities; +use types::CodeActionKind; +use types::CodeActionOptions; +use types::DiagnosticOptions; +use types::OneOf; +use types::PositionEncodingKind; +use types::TextDocumentSyncCapability; +use types::TextDocumentSyncKind; +use types::TextDocumentSyncOptions; +use types::WorkDoneProgressOptions; +use types::WorkspaceFoldersServerCapabilities; + +use self::schedule::main_thread; +use crate::session::Session; + +mod api; +mod client; +mod schedule; + +pub(crate) type Result = std::result::Result; + +pub struct Server { + conn: lsp::Connection, + threads: lsp::IoThreads, + session: Session, +} + +impl Server { + pub fn new() -> crate::Result { + let (conn, threads) = lsp::Connection::stdio(); + + let (id, params) = conn.initialize_start()?; + + let init_params: types::InitializeParams = serde_json::from_value(params)?; + + let client_capabilities = init_params.capabilities; + let server_capabilities = Self::server_capabilities(&client_capabilities); + + let workspaces = init_params + .workspace_folders + .map(|f| f.into_iter().map(|w| w.uri).collect()) + .or_else(|| init_params.root_uri.map(|u| vec![u])).ok_or_else(|| anyhow!("No workspace or root URI was given in the LSP initialization parameters. 
The server cannot start."))?; + + let initialize_data = serde_json::json!({ + "capabilities": server_capabilities, + "serverInfo": { + "name": crate::SERVER_NAME, + "version": crate::version() + } + }); + + conn.initialize_finish(id, initialize_data)?; + + Ok(Self { + conn, + threads, + session: Session::new(&server_capabilities, &workspaces)?, + }) + } + + pub fn run(self) -> crate::Result<()> { + let result = main_thread(move || Self::main_loop(&self.conn, self.session))?.join(); + self.threads.join()?; + result + } + + fn main_loop(connection: &Connection, session: Session) -> crate::Result<()> { + // TODO(jane): Make thread count configurable + let mut scheduler = schedule::Scheduler::new(session, 4, &connection.sender); + for msg in &connection.receiver { + scheduler.process_events(); + let task = match msg { + lsp::Message::Request(req) => { + if connection.handle_shutdown(&req)? { + return Ok(()); + } + api::request(req) + } + lsp::Message::Notification(notification) => api::notification(notification), + lsp::Message::Response(response) => { + tracing::error!( + "Expected request or notification, got response instead: {response:?}" + ); + continue; + } + }; + scheduler.dispatch(task); + } + Ok(()) + } + + fn server_capabilities(client_capabilities: &ClientCapabilities) -> types::ServerCapabilities { + let position_encoding = client_capabilities + .general + .as_ref() + .and_then(|g| g.position_encodings.as_ref()) + .and_then(|e| e.first()) + .cloned() + .unwrap_or(PositionEncodingKind::UTF8); + types::ServerCapabilities { + position_encoding: Some(position_encoding), + code_action_provider: Some(types::CodeActionProviderCapability::Options( + CodeActionOptions { + code_action_kinds: Some(vec![ + CodeActionKind::QUICKFIX, + CodeActionKind::SOURCE_ORGANIZE_IMPORTS, + ]), + work_done_progress_options: WorkDoneProgressOptions { + work_done_progress: Some(true), + }, + resolve_provider: Some(false), + }, + )), + workspace: Some(types::WorkspaceServerCapabilities 
{ + workspace_folders: Some(WorkspaceFoldersServerCapabilities { + supported: Some(true), + change_notifications: Some(OneOf::Left(true)), + }), + file_operations: None, + }), + document_formatting_provider: Some(OneOf::Left(true)), + document_range_formatting_provider: Some(OneOf::Left(true)), + diagnostic_provider: Some(types::DiagnosticServerCapabilities::Options( + DiagnosticOptions { + identifier: Some(crate::DIAGNOSTIC_NAME.into()), + // multi-file analysis could change this + inter_file_dependencies: false, + workspace_diagnostics: false, + work_done_progress_options: WorkDoneProgressOptions { + work_done_progress: Some(true), + }, + }, + )), + text_document_sync: Some(TextDocumentSyncCapability::Options( + TextDocumentSyncOptions { + open_close: Some(true), + change: Some(TextDocumentSyncKind::INCREMENTAL), + will_save: Some(false), + will_save_wait_until: Some(false), + ..Default::default() + }, + )), + ..Default::default() + } + } +} diff --git a/crates/ruff_server/src/server/api.rs b/crates/ruff_server/src/server/api.rs new file mode 100644 index 00000000000000..aabe73f6c32863 --- /dev/null +++ b/crates/ruff_server/src/server/api.rs @@ -0,0 +1,148 @@ +use crate::server::schedule::Task; +use lsp_server as server; + +mod notifications; +mod requests; +mod traits; + +use notifications as notification; +use requests as request; +use traits::{BackgroundRequest, SyncNotification}; + +use super::Result; + +/// A helper macro for [`select_task`] that builds a task from a successful request match. +/// It can take optional configuration in the form of `{ use }` where `schedule` is +/// a constructor function of `Task`. This determines how the handler is scheduled. +macro_rules! handle_task { + // If no configuration is provided, we create a background task by default. 
+ ($class: ty, $id: ident, $params: ident, $handle:ty) => { + handle_task!($class, $id, $params, $handle { use background }) + }; + // If we're building a sync task, the constructor takes slightly different + // arguments, so this needs to be a special case. + ($class: ty, $id: ident, $params: ident, $handle:ty { use local }) => { + Task::local(move |session, notifier, responder| { + let result = <$handle>::run(session, notifier, $params); + <$handle as $class>::respond($id, result, &responder); + }) + }; + // Otherwise, this is a builder for a background task of some `$schedule`. + // We don't have access to the session here, so we need to create a 'builder' closure + // around the inner task closure to take a snapshot when this task is dispatched. + ($class: ty, $id: ident, $params: ident, $handle:ty { use $schedule:ident }) => { + Task::$schedule(move |session| { + // TODO(jane): we should log an error if we can't take a snapshot. + let Some(snapshot) = session.take_snapshot(<$handle>::document_url(&$params)) else { return Box::new(|_, _| {}) }; + Box::new(move |notifier, responder| { + let result = <$handle>::run_with_snapshot(snapshot, notifier, $params); + <$handle as $class>::respond($id, result, &responder); + }) + }) + }; +} + +/// Defines logic to route a server message sub-type to a series of handlers that share +/// a specific `$class` - in this case, [`traits::Request`] and [`traits::Notification`] are valid +/// handler classes. This macro generates the construction of each possible task based on the provided handler implementations. +/// The scheduling configuration for each task is also set here. +macro_rules! select_task { + (match $req:ident as $class:ty { $($handle:ty$({ $($conf:tt)* })?),* $(,)? }) => { + (move || { + let build_err = |err| match err { + json_err @ lsp_server::ExtractError::JsonError { .. 
} => { + let err: anyhow::Error = json_err.into(); + anyhow::anyhow!("JSON parsing failure:\n{err}") + }, + lsp_server::ExtractError::MethodMismatch(_) => { + unreachable!("A method mismatch should not be possible here, unless the `cast` implementation for this request has been changed.") + } + }; + match $req.method.as_str() { + $(<$handle as $class>::METHOD => { + let (id, params) = <$handle as $class>::cast($req).map_err(build_err).with_failure_code(lsp_server::ErrorCode::ParseError)?; + Ok(handle_task!($class, id, params, $handle $({$($conf)*})?)) + }),* + _ => Err(anyhow::anyhow!("No route found for {:?}", $req)).with_failure_code(lsp_server::ErrorCode::MethodNotFound) + } + })() + }; +} + +macro_rules! define_document_url { + ($params:ident: &$p:ty) => { + fn document_url($params: &$p) -> &lsp_types::Url { + &$params.text_document.uri + } + }; +} + +use define_document_url; + +pub(in crate::server) fn request<'a>(req: server::Request) -> Task<'a> { + let id = req.id.clone(); + select_task! { + match req as traits::Request { + request::CodeAction { use low_latency_thread }, + request::Diagnostic { use low_latency_thread }, + request::Format { use fmt_thread }, + request::FormatRange { use fmt_thread }, + } + } + .unwrap_or_else(|err| { + tracing::error!("Encountered error when routing request: {err}"); + let result: Result<()> = Err(err); + Task::immediate(id, result) + }) +} + +pub(in crate::server) fn notification<'a>(notif: server::Notification) -> Task<'a> { + select_task! 
{ + match notif as traits::Notification { + notification::Cancel { use local }, + notification::DidOpen { use local }, + notification::DidChange { use local }, + notification::DidChangeConfiguration { use local }, + notification::DidChangeWorkspace { use local }, + notification::DidClose { use local }, + } + } + .unwrap_or_else(|err| { + tracing::error!("Encountered error when routing notification: {err}"); + Task::nothing() + }) +} + +pub(crate) struct Error { + pub(crate) code: lsp_server::ErrorCode, + pub(crate) error: anyhow::Error, +} + +/// A trait to convert result types into the server result type, [`super::Result`]. +trait LSPResult { + fn with_failure_code(self, code: lsp_server::ErrorCode) -> super::Result; +} + +impl LSPResult for anyhow::Result { + fn with_failure_code(self, code: server::ErrorCode) -> super::Result { + self.map_err(|err| Error::new(err, code)) + } +} + +impl Error { + pub(crate) fn new(err: anyhow::Error, code: lsp_server::ErrorCode) -> Self { + Self { code, error: err } + } +} + +impl std::fmt::Debug for Error { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + self.error.fmt(f) + } +} + +impl std::fmt::Display for Error { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + self.error.fmt(f) + } +} diff --git a/crates/ruff_server/src/server/api/notifications.rs b/crates/ruff_server/src/server/api/notifications.rs new file mode 100644 index 00000000000000..1aa7e7a5113bc0 --- /dev/null +++ b/crates/ruff_server/src/server/api/notifications.rs @@ -0,0 +1,17 @@ +mod cancel; +mod did_change; +mod did_change_configuration; +mod did_change_workspace; +mod did_close; +mod did_open; + +use super::{ + define_document_url, + traits::{Notification, SyncNotification}, +}; +pub(super) use cancel::Cancel; +pub(super) use did_change::DidChange; +pub(super) use did_change_configuration::DidChangeConfiguration; +pub(super) use did_change_workspace::DidChangeWorkspace; +pub(super) use did_close::DidClose; 
+pub(super) use did_open::DidOpen; diff --git a/crates/ruff_server/src/server/api/notifications/cancel.rs b/crates/ruff_server/src/server/api/notifications/cancel.rs new file mode 100644 index 00000000000000..789f9d8e561368 --- /dev/null +++ b/crates/ruff_server/src/server/api/notifications/cancel.rs @@ -0,0 +1,22 @@ +use crate::server::client::Notifier; +use crate::server::Result; +use crate::session::Session; +use lsp_types as types; +use lsp_types::notification as notif; + +pub(crate) struct Cancel; + +impl super::Notification for Cancel { + type NotificationType = notif::Cancel; +} + +impl super::SyncNotification for Cancel { + #[tracing::instrument(skip_all)] + fn run( + _session: &mut Session, + _notifier: Notifier, + _params: types::CancelParams, + ) -> Result<()> { + Ok(()) + } +} diff --git a/crates/ruff_server/src/server/api/notifications/did_change.rs b/crates/ruff_server/src/server/api/notifications/did_change.rs new file mode 100644 index 00000000000000..5c39668fbb99f6 --- /dev/null +++ b/crates/ruff_server/src/server/api/notifications/did_change.rs @@ -0,0 +1,39 @@ +use crate::server::api::LSPResult; +use crate::server::client::Notifier; +use crate::server::Result; +use crate::session::Session; +use lsp_types as types; +use lsp_types::notification as notif; + +pub(crate) struct DidChange; + +impl super::Notification for DidChange { + type NotificationType = notif::DidChangeTextDocument; +} + +impl super::SyncNotification for DidChange { + #[tracing::instrument(skip_all, fields(file=%params.text_document.uri))] + fn run( + session: &mut Session, + _notifier: Notifier, + params: types::DidChangeTextDocumentParams, + ) -> Result<()> { + super::define_document_url!(params: &types::DidChangeTextDocumentParams); + + if params.content_changes.is_empty() { + return Ok(()); + } + + let new_version = params.text_document.version; + + let encoding = session.encoding(); + + let document = session + .document_controller(document_url(¶ms)) + 
.with_failure_code(lsp_server::ErrorCode::InvalidParams)?; + + document.apply_changes(params.content_changes, new_version, encoding); + + Ok(()) + } +} diff --git a/crates/ruff_server/src/server/api/notifications/did_change_configuration.rs b/crates/ruff_server/src/server/api/notifications/did_change_configuration.rs new file mode 100644 index 00000000000000..c111cb148efaca --- /dev/null +++ b/crates/ruff_server/src/server/api/notifications/did_change_configuration.rs @@ -0,0 +1,22 @@ +use crate::server::client::Notifier; +use crate::server::Result; +use crate::session::Session; +use lsp_types as types; +use lsp_types::notification as notif; + +pub(crate) struct DidChangeConfiguration; + +impl super::Notification for DidChangeConfiguration { + type NotificationType = notif::DidChangeConfiguration; +} + +impl super::SyncNotification for DidChangeConfiguration { + fn run( + _session: &mut Session, + _notifier: Notifier, + _params: types::DidChangeConfigurationParams, + ) -> Result<()> { + // TODO(jane): get this wired up after the pre-release + Ok(()) + } +} diff --git a/crates/ruff_server/src/server/api/notifications/did_change_workspace.rs b/crates/ruff_server/src/server/api/notifications/did_change_workspace.rs new file mode 100644 index 00000000000000..b40d27f8e161fb --- /dev/null +++ b/crates/ruff_server/src/server/api/notifications/did_change_workspace.rs @@ -0,0 +1,32 @@ +use crate::server::api::LSPResult; +use crate::server::client::Notifier; +use crate::server::Result; +use crate::session::Session; +use lsp_types as types; +use lsp_types::notification as notif; + +pub(crate) struct DidChangeWorkspace; + +impl super::Notification for DidChangeWorkspace { + type NotificationType = notif::DidChangeWorkspaceFolders; +} + +impl super::SyncNotification for DidChangeWorkspace { + fn run( + session: &mut Session, + _notifier: Notifier, + params: types::DidChangeWorkspaceFoldersParams, + ) -> Result<()> { + for new in params.event.added { + session + 
.open_workspace_folder(&new.uri) + .with_failure_code(lsp_server::ErrorCode::InvalidParams)?; + } + for removed in params.event.removed { + session + .close_workspace_folder(&removed.uri) + .with_failure_code(lsp_server::ErrorCode::InvalidParams)?; + } + Ok(()) + } +} diff --git a/crates/ruff_server/src/server/api/notifications/did_close.rs b/crates/ruff_server/src/server/api/notifications/did_close.rs new file mode 100644 index 00000000000000..60f646b9a47838 --- /dev/null +++ b/crates/ruff_server/src/server/api/notifications/did_close.rs @@ -0,0 +1,26 @@ +use crate::server::api::LSPResult; +use crate::server::client::Notifier; +use crate::server::Result; +use crate::session::Session; +use lsp_types as types; +use lsp_types::notification as notif; + +pub(crate) struct DidClose; + +impl super::Notification for DidClose { + type NotificationType = notif::DidCloseTextDocument; +} + +impl super::SyncNotification for DidClose { + #[tracing::instrument(skip_all, fields(file=%params.text_document.uri))] + fn run( + session: &mut Session, + _notifier: Notifier, + params: types::DidCloseTextDocumentParams, + ) -> Result<()> { + super::define_document_url!(params: &types::DidCloseTextDocumentParams); + session + .close_document(document_url(¶ms)) + .with_failure_code(lsp_server::ErrorCode::InternalError) + } +} diff --git a/crates/ruff_server/src/server/api/notifications/did_open.rs b/crates/ruff_server/src/server/api/notifications/did_open.rs new file mode 100644 index 00000000000000..68e79b1081a06a --- /dev/null +++ b/crates/ruff_server/src/server/api/notifications/did_open.rs @@ -0,0 +1,31 @@ +use crate::server::client::Notifier; +use crate::server::Result; +use crate::session::Session; +use lsp_types as types; +use lsp_types::notification as notif; + +pub(crate) struct DidOpen; + +impl super::Notification for DidOpen { + type NotificationType = notif::DidOpenTextDocument; +} + +impl super::SyncNotification for DidOpen { + #[tracing::instrument(skip_all, 
fields(file=%url))] + fn run( + session: &mut Session, + _notifier: Notifier, + types::DidOpenTextDocumentParams { + text_document: + types::TextDocumentItem { + uri: ref url, + text, + version, + .. + }, + }: types::DidOpenTextDocumentParams, + ) -> Result<()> { + session.open_document(url, text, version); + Ok(()) + } +} diff --git a/crates/ruff_server/src/server/api/requests.rs b/crates/ruff_server/src/server/api/requests.rs new file mode 100644 index 00000000000000..28f2d2853fd46c --- /dev/null +++ b/crates/ruff_server/src/server/api/requests.rs @@ -0,0 +1,15 @@ +mod code_action; +mod diagnostic; +mod format; +mod format_range; + +use super::{ + define_document_url, + traits::{BackgroundRequest, Request}, +}; +pub(super) use code_action::CodeAction; +pub(super) use diagnostic::Diagnostic; +pub(super) use format::Format; +pub(super) use format_range::FormatRange; + +type FormatResponse = Option>; diff --git a/crates/ruff_server/src/server/api/requests/code_action.rs b/crates/ruff_server/src/server/api/requests/code_action.rs new file mode 100644 index 00000000000000..28a22ace6d6219 --- /dev/null +++ b/crates/ruff_server/src/server/api/requests/code_action.rs @@ -0,0 +1,69 @@ +use crate::edit::text_range_to_range; +use crate::server::{client::Notifier, Result}; +use crate::session::SessionSnapshot; +use lsp_types::{self as types, request as req}; +use ruff_text_size::Ranged; + +pub(crate) struct CodeAction; + +impl super::Request for CodeAction { + type RequestType = req::CodeActionRequest; +} + +impl super::BackgroundRequest for CodeAction { + super::define_document_url!(params: &types::CodeActionParams); + fn run_with_snapshot( + snapshot: SessionSnapshot, + _notifier: Notifier, + params: types::CodeActionParams, + ) -> Result> { + let document = snapshot.document(); + let url = snapshot.url(); + let encoding = snapshot.encoding(); + let version = document.version(); + let actions = params + .context + .diagnostics + .into_iter() + .filter_map(|diagnostic| { + 
let diagnostic_fix: crate::lint::DiagnosticFix = + serde_json::from_value(diagnostic.data?).ok()?; + let edits = diagnostic_fix + .fix + .edits() + .iter() + .map(|edit| types::TextEdit { + range: text_range_to_range(edit.range(), document, encoding), + new_text: edit.content().unwrap_or_default().to_string(), + }); + + let changes = vec![types::TextDocumentEdit { + text_document: types::OptionalVersionedTextDocumentIdentifier::new( + url.clone(), + version, + ), + edits: edits.map(types::OneOf::Left).collect(), + }]; + + let title = diagnostic_fix + .kind + .suggestion + .unwrap_or(diagnostic_fix.kind.name); + Some(types::CodeAction { + title, + kind: Some(types::CodeActionKind::QUICKFIX), + edit: Some(types::WorkspaceEdit { + document_changes: Some(types::DocumentChanges::Edits(changes)), + ..Default::default() + }), + ..Default::default() + }) + }); + + Ok(Some( + actions + .map(types::CodeActionOrCommand::CodeAction) + .collect(), + )) + } +} diff --git a/crates/ruff_server/src/server/api/requests/diagnostic.rs b/crates/ruff_server/src/server/api/requests/diagnostic.rs new file mode 100644 index 00000000000000..13f95a453cad95 --- /dev/null +++ b/crates/ruff_server/src/server/api/requests/diagnostic.rs @@ -0,0 +1,38 @@ +use crate::server::{client::Notifier, Result}; +use crate::session::SessionSnapshot; +use lsp_types::{self as types, request as req}; +use types::{ + DocumentDiagnosticReportResult, FullDocumentDiagnosticReport, + RelatedFullDocumentDiagnosticReport, +}; + +pub(crate) struct Diagnostic; + +impl super::Request for Diagnostic { + type RequestType = req::DocumentDiagnosticRequest; +} + +impl super::BackgroundRequest for Diagnostic { + super::define_document_url!(params: &types::DocumentDiagnosticParams); + fn run_with_snapshot( + snapshot: SessionSnapshot, + _notifier: Notifier, + _params: types::DocumentDiagnosticParams, + ) -> Result { + let diagnostics = crate::lint::check( + snapshot.document(), + &snapshot.configuration().linter, + 
snapshot.encoding(), + ); + + Ok(DocumentDiagnosticReportResult::Report( + types::DocumentDiagnosticReport::Full(RelatedFullDocumentDiagnosticReport { + related_documents: None, + full_document_diagnostic_report: FullDocumentDiagnosticReport { + result_id: None, + items: diagnostics, + }, + }), + )) + } +} diff --git a/crates/ruff_server/src/server/api/requests/format.rs b/crates/ruff_server/src/server/api/requests/format.rs new file mode 100644 index 00000000000000..622621916ae597 --- /dev/null +++ b/crates/ruff_server/src/server/api/requests/format.rs @@ -0,0 +1,36 @@ +use crate::edit::text_range_to_range; +use crate::server::api::LSPResult; +use crate::server::{client::Notifier, Result}; +use crate::session::SessionSnapshot; +use lsp_types::{self as types, request as req}; +use ruff_text_size::{TextRange, TextSize}; +use types::TextEdit; + +pub(crate) struct Format; + +impl super::Request for Format { + type RequestType = req::Formatting; +} + +impl super::BackgroundRequest for Format { + super::define_document_url!(params: &types::DocumentFormattingParams); + fn run_with_snapshot( + snapshot: SessionSnapshot, + _notifier: Notifier, + _params: types::DocumentFormattingParams, + ) -> Result { + let code = crate::format::format(snapshot.document(), &snapshot.configuration().formatter) + .with_failure_code(lsp_server::ErrorCode::InternalError)?; + + let doc_size = TextSize::of(snapshot.document().contents()); + // TODO(jane): Can we try breaking this up into individual text edits instead of replacing the whole document? 
+ Ok(Some(vec![TextEdit { + range: text_range_to_range( + TextRange::up_to(doc_size), + snapshot.document(), + snapshot.encoding(), + ), + new_text: code, + }])) + } +} diff --git a/crates/ruff_server/src/server/api/requests/format_range.rs b/crates/ruff_server/src/server/api/requests/format_range.rs new file mode 100644 index 00000000000000..c59012ee173b24 --- /dev/null +++ b/crates/ruff_server/src/server/api/requests/format_range.rs @@ -0,0 +1,40 @@ +use crate::edit::{text_range, text_range_to_range}; +use crate::server::api::LSPResult; +use crate::server::{client::Notifier, Result}; +use crate::session::SessionSnapshot; +use lsp_types::{self as types, request as req}; + +pub(crate) struct FormatRange; + +impl super::Request for FormatRange { + type RequestType = req::RangeFormatting; +} + +impl super::BackgroundRequest for FormatRange { + super::define_document_url!(params: &types::DocumentRangeFormattingParams); + fn run_with_snapshot( + snapshot: SessionSnapshot, + _notifier: Notifier, + params: types::DocumentRangeFormattingParams, + ) -> Result { + let document = snapshot.document(); + let range = text_range( + params.range, + document.contents(), + document.index(), + snapshot.encoding(), + ); + let formatted_range = + crate::format::range_format(document, &snapshot.configuration().formatter, range) + .with_failure_code(lsp_server::ErrorCode::InternalError)?; + + Ok(Some(vec![types::TextEdit { + range: text_range_to_range( + formatted_range.source_range(), + document, + snapshot.encoding(), + ), + new_text: formatted_range.into_code(), + }])) + } +} diff --git a/crates/ruff_server/src/server/api/traits.rs b/crates/ruff_server/src/server/api/traits.rs new file mode 100644 index 00000000000000..ce4b10fd62f242 --- /dev/null +++ b/crates/ruff_server/src/server/api/traits.rs @@ -0,0 +1,142 @@ +//! A stateful LSP implementation that calls into the Ruff API. 
+ +use crate::server::client::{Notifier, Responder}; +use crate::session::{Session, SessionSnapshot}; + +use lsp_server as server; +use lsp_types::notification::Notification as LSPNotification; +use lsp_types::request::Request as LSPRequest; + +/// A supertrait for any server request handler. +pub(super) trait Request { + type RequestType: LSPRequest; + const METHOD: &'static str = <::RequestType as LSPRequest>::METHOD; + + /// Tries to cast a serialized request from the server into + /// a parameter type for a specific request handler. + /// It is *highly* recommended to not override this function in your + /// implementation. + fn cast( + request: server::Request, + ) -> std::result::Result< + ( + lsp_server::RequestId, + <::RequestType as LSPRequest>::Params, + ), + server::ExtractError, + > { + request.extract(Self::METHOD) + } + + /// Sends back a response to the server using a [`Responder`]. + /// `R` should be the expected response type for this request. + /// It is *highly* recommended to not override this function in your + /// implementation. + fn respond( + id: lsp_server::RequestId, + result: crate::server::Result, + responder: &Responder, + ) where + R: serde::Serialize, + { + if let Err(err) = responder.respond(id, result) { + tracing::error!("Failed to send response: {err}"); + } + } +} + +/// A request handler that needs mutable access to the session. +/// This will block the main message receiver loop, meaning that no +/// incoming requests or notifications will be handled while `run` is +/// executing. Try to avoid doing any I/O or long computations. +pub(super) trait SyncRequest: Request { + fn run( + session: &mut Session, + notifier: Notifier, + params: <::RequestType as LSPRequest>::Params, + ) -> super::Result<<::RequestType as LSPRequest>::Result>; +} + +/// A request handler that can be run on a background thread. 
+/// `document_url` can be implemented automatically with +/// `define_document_url!(params: &)` in the trait +/// implementation. +pub(super) trait BackgroundRequest: Request { + fn document_url( + params: &<::RequestType as LSPRequest>::Params, + ) -> &lsp_types::Url; + + fn run_with_snapshot( + snapshot: SessionSnapshot, + notifier: Notifier, + params: <::RequestType as LSPRequest>::Params, + ) -> super::Result<<::RequestType as LSPRequest>::Result>; +} + +/// A supertrait for any server notification handler. +pub(super) trait Notification { + type NotificationType: LSPNotification; + const METHOD: &'static str = + <::NotificationType as LSPNotification>::METHOD; + + /// Tries to cast a serialized request from the server into + /// a parameter type for a specific request handler. + /// It is *highly* recommended to not override this function in your + /// implementation. + fn cast( + notification: server::Notification, + ) -> std::result::Result< + ( + String, + <::NotificationType as LSPNotification>::Params, + ), + server::ExtractError, + > { + Ok(( + Self::METHOD.to_string(), + notification.extract(Self::METHOD)?, + )) + } + + /// This is not supposed to do anything besides reporting errors, since + /// notifications don't need send anything back to the client. + /// [`Notification`] needs this method in order to have function name compatibility + /// with [`Request`]. + /// It is *highly* recommended to not override this function in your + /// implementation. + fn respond(method: String, result: crate::server::Result<()>, _responder: &Responder) { + if let Err(err) = result { + tracing::error!("Background notification failed: {err}"); + } else { + tracing::debug!("`{method}` notification handler finished successfully"); + } + } +} + +/// A notification handler that needs mutable access to the session. +/// This will block the main message receiver loop, meaning that no +/// incoming requests or notifications will be handled while `run` is +/// executing. 
Try to avoid doing any I/O or long computations. +pub(super) trait SyncNotification: Notification { + fn run( + session: &mut Session, + notifier: Notifier, + params: <::NotificationType as LSPNotification>::Params, + ) -> super::Result<()>; +} + +/// A notification handler that can be run on a background thread. +/// `document_url` can be implemented automatically with +/// `define_document_url!(params: &)` in the trait +/// implementation. +pub(super) trait BackgroundNotification: Notification { + fn document_url( + params: &<::NotificationType as LSPNotification>::Params, + ) -> &lsp_types::Url; + + fn run_with_snapshot( + snapshot: SessionSnapshot, + notifier: Notifier, + params: <::NotificationType as LSPNotification>::Params, + ) -> super::Result<()>; +} diff --git a/crates/ruff_server/src/server/client.rs b/crates/ruff_server/src/server/client.rs new file mode 100644 index 00000000000000..a385632c50814e --- /dev/null +++ b/crates/ruff_server/src/server/client.rs @@ -0,0 +1,77 @@ +use lsp_server::{Notification, RequestId}; +use serde_json::Value; + +pub(crate) type ClientSender = crossbeam::channel::Sender; + +/// +pub(crate) struct Client { + notifier: Notifier, + responder: Responder, +} + +#[derive(Clone)] +pub(crate) struct Notifier(ClientSender); + +#[derive(Clone)] +pub(crate) struct Responder(ClientSender); + +impl Client { + pub(in crate::server) fn new(sender: &ClientSender) -> Self { + Self { + notifier: Notifier(sender.clone()), + responder: Responder(sender.clone()), + } + } + + pub(super) fn notifier(&self) -> Notifier { + self.notifier.clone() + } + + pub(super) fn responder(&self) -> Responder { + self.responder.clone() + } +} + +#[allow(dead_code)] // we'll need to use `Notifier` in the future +impl Notifier { + pub(crate) fn notify(&self, params: N::Params) -> crate::Result<()> + where + N: lsp_types::notification::Notification, + { + let method = N::METHOD.to_string(); + + let message = 
lsp_server::Message::Notification(Notification::new(method, params)); + + Ok(self.0.send(message)?) + } + + pub(crate) fn notify_method(&self, method: String) -> crate::Result<()> { + Ok(self + .0 + .send(lsp_server::Message::Notification(Notification::new( + method, + Value::Null, + )))?) + } +} + +impl Responder { + pub(crate) fn respond( + &self, + id: RequestId, + result: crate::server::Result, + ) -> crate::Result<()> + where + R: serde::Serialize, + { + Ok(self.0.send( + match result { + Ok(res) => lsp_server::Response::new_ok(id, res), + Err(crate::server::api::Error { code, error }) => { + lsp_server::Response::new_err(id, code as i32, format!("{error}")) + } + } + .into(), + )?) + } +} diff --git a/crates/ruff_server/src/server/schedule.rs b/crates/ruff_server/src/server/schedule.rs new file mode 100644 index 00000000000000..a6d9eede347327 --- /dev/null +++ b/crates/ruff_server/src/server/schedule.rs @@ -0,0 +1,96 @@ +use crossbeam::channel::Sender; + +use crate::session::Session; + +mod task; +mod thread; + +pub(super) use task::Task; + +use self::{ + task::{BackgroundSchedule, BackgroundTaskBuilder, SyncTask}, + thread::ThreadPriority, +}; + +use super::client::Client; + +/// The main thread is actually a secondary thread that we spawn from the +/// _actual_ main thread. This secondary thread has a larger stack size +/// than some OS defaults (Windows, for example) and is also designated as +/// high-priority. 
+pub(crate) fn main_thread( + func: impl FnOnce() -> crate::Result<()> + Send + 'static, +) -> crate::Result>> { + const MAIN_THREAD_STACK_SIZE: usize = 2 * 1024 * 1024; + const MAIN_THREAD_NAME: &str = "ruff:main"; + Ok( + thread::Builder::new(thread::ThreadPriority::LatencySensitive) + .name(MAIN_THREAD_NAME.into()) + .stack_size(MAIN_THREAD_STACK_SIZE) + .spawn(func)?, + ) +} + +pub(crate) struct Scheduler { + session: Session, + client: Client, + fmt_pool: thread::Pool, + background_pool: thread::Pool, +} + +impl Scheduler { + pub(super) fn new( + session: Session, + thread_count: usize, + sender: &Sender, + ) -> Self { + Self { + session, + fmt_pool: thread::Pool::new(1), + background_pool: thread::Pool::new(thread_count), + client: Client::new(sender), + } + } + + /// This is executed lazily when a new message is received, but it will run before + /// any message handling logic. + pub(super) fn process_events(&mut self) { + // TODO: figure out how to notify client to run a diagnostic refresh. + // We might need to push diagnostics with notifications to begin with. + self.session.update_configuration_files(); + } + + /// Dispatches a `task` by either running it as a blocking function or + /// executing it on a background thread pool. 
+ pub(super) fn dispatch<'s>(&'s mut self, task: task::Task<'s>) { + match task { + Task::Sync(SyncTask { func }) => { + func( + &mut self.session, + self.client.notifier(), + self.client.responder(), + ); + } + Task::Background(BackgroundTaskBuilder { + schedule, + builder: func, + }) => { + let static_func = func(&self.session); + let notifier = self.client.notifier(); + let responder = self.client.responder(); + let task = move || static_func(notifier, responder); + match schedule { + BackgroundSchedule::Worker => { + self.background_pool.spawn(ThreadPriority::Worker, task); + } + BackgroundSchedule::LatencySensitive => self + .background_pool + .spawn(ThreadPriority::LatencySensitive, task), + BackgroundSchedule::Fmt => { + self.fmt_pool.spawn(ThreadPriority::LatencySensitive, task); + } + } + } + } + } +} diff --git a/crates/ruff_server/src/server/schedule/task.rs b/crates/ruff_server/src/server/schedule/task.rs new file mode 100644 index 00000000000000..a6ac500c5ca635 --- /dev/null +++ b/crates/ruff_server/src/server/schedule/task.rs @@ -0,0 +1,111 @@ +use lsp_server::RequestId; +use serde::Serialize; + +use crate::{ + server::client::{Notifier, Responder}, + session::Session, +}; + +type LocalFn<'s> = Box; + +type BackgroundFn = Box; + +type BackgroundFnBuilder<'s> = Box BackgroundFn + 's>; + +/// Describes how the task should be run. +#[derive(Clone, Copy, Debug, Default)] +pub(super) enum BackgroundSchedule { + /// The task should be run on the background thread designated + /// for formatting actions. This is a high priority thread. + Fmt, + /// The task should be run on the general high-priority background + /// thread. + LatencySensitive, + /// The task should be run on a regular-priority background thread. + #[default] + Worker, +} + +/// A [`Task`] is a future that has not yet started, and it is the job of +/// the [`super::Scheduler`] to make that happen, via [`super::Scheduler::dispatch`]. 
+/// A task can either run on the main thread (in other words, the same thread as the +/// scheduler) or it can run in a background thread. The main difference between +/// the two is that background threads only have a read-only snapshot of the session, +/// while local tasks have exclusive access and can modify it as they please. Keep in mind that +/// local tasks will **block** the main event loop, so only use local tasks if you **need** +/// mutable state access or you need the absolute lowest latency possible. +pub(in crate::server) enum Task<'s> { + Background(BackgroundTaskBuilder<'s>), + Sync(SyncTask<'s>), +} + +// The reason why this isn't just a 'static background closure +// is because we need to take a snapshot of the session before sending +// this task to the background, and the inner closure can't take the session +// as an immutable reference since it's used mutably elsewhere. So instead, +// a background task is built using an outer closure that borrows the session to take a snapshot, +// that the inner closure can capture. This builder closure has a lifetime linked to the scheduler. +// When the task is dispatched, the scheduler runs the synchronous builder, which takes the session +// as a reference, to create the inner 'static closure. That closure is then moved to a background task pool. +pub(in crate::server) struct BackgroundTaskBuilder<'s> { + pub(super) schedule: BackgroundSchedule, + pub(super) builder: BackgroundFnBuilder<'s>, +} + +pub(in crate::server) struct SyncTask<'s> { + pub(super) func: LocalFn<'s>, +} + +impl<'s> Task<'s> { + /// Creates a new background task. + #[allow(dead_code)] // TODO: remove once we have a regular background task in our implementation + pub(crate) fn background_thread( + func: impl FnOnce(&Session) -> Box + 's, + ) -> Self { + Self::Background(BackgroundTaskBuilder { + schedule: BackgroundSchedule::Worker, + builder: Box::new(func), + }) + } + /// Creates a new high-priority background task. 
+ pub(crate) fn low_latency_thread( + func: impl FnOnce(&Session) -> Box + 's, + ) -> Self { + Self::Background(BackgroundTaskBuilder { + schedule: BackgroundSchedule::LatencySensitive, + builder: Box::new(func), + }) + } + /// Creates a new high-priority background task, + /// designated for the formatting thread. + pub(crate) fn fmt_thread( + func: impl FnOnce(&Session) -> Box + 's, + ) -> Self { + Self::Background(BackgroundTaskBuilder { + schedule: BackgroundSchedule::Fmt, + builder: Box::new(func), + }) + } + /// Creates a new local (synchronous) task with mutable access to the session. + pub(crate) fn local(func: impl FnOnce(&mut Session, Notifier, Responder) + 's) -> Self { + Self::Sync(SyncTask { + func: Box::new(func), + }) + } + /// Creates a local task that immediately + /// responds to request `id` with the provided `result`. + pub(crate) fn immediate(id: RequestId, result: crate::server::Result) -> Self + where + R: Serialize + Send + 'static, + { + Self::local(move |_, _, responder| { + if let Err(err) = responder.respond(id, result) { + tracing::error!("Unable to send immediate response: {err}"); + } + }) + } + /// Creates a local task that does nothing. + pub(crate) fn nothing() -> Self { + Self::local(move |_, _, _| {}) + } +} diff --git a/crates/ruff_server/src/server/schedule/thread.rs b/crates/ruff_server/src/server/schedule/thread.rs new file mode 100644 index 00000000000000..da3ea8c2f2036a --- /dev/null +++ b/crates/ruff_server/src/server/schedule/thread.rs @@ -0,0 +1,109 @@ +// +------------------------------------------------------------+ +// | Code adopted from: | +// | Repository: https://github.com/rust-lang/rust-analyzer.git | +// | File: `crates/stdx/src/thread.rs` | +// | Commit: 03b3cb6be9f21c082f4206b35c7fe7f291c94eaa | +// +------------------------------------------------------------+ +//! A utility module for working with threads that automatically joins threads upon drop +//! and abstracts over operating system quality of service (QoS) APIs +//! 
through the concept of a “thread priority”. +//! +//! The priority of a thread is frozen at thread creation time, +//! i.e. there is no API to change the priority of a thread once it has been spawned. +//! +//! As a system, rust-analyzer should have the property that +//! old manual scheduling APIs are replaced entirely by QoS. +//! To maintain this invariant, we panic when it is clear that +//! old scheduling APIs have been used. +//! +//! Moreover, we also want to ensure that every thread has a priority set explicitly +//! to force a decision about its importance to the system. +//! Thus, [`ThreadPriority`] has no default value +//! and every entry point to creating a thread requires a [`ThreadPriority`] upfront. + +// Keeps us from getting warnings about the word `QoS` +#![allow(clippy::doc_markdown)] + +use std::fmt; + +mod pool; +mod priority; + +pub(super) use pool::Pool; +pub(super) use priority::ThreadPriority; + +pub(super) struct Builder { + priority: ThreadPriority, + inner: jod_thread::Builder, +} + +impl Builder { + pub(super) fn new(priority: ThreadPriority) -> Builder { + Builder { + priority, + inner: jod_thread::Builder::new(), + } + } + + pub(super) fn name(self, name: String) -> Builder { + Builder { + inner: self.inner.name(name), + ..self + } + } + + pub(super) fn stack_size(self, size: usize) -> Builder { + Builder { + inner: self.inner.stack_size(size), + ..self + } + } + + pub(super) fn spawn(self, f: F) -> std::io::Result> + where + F: FnOnce() -> T, + F: Send + 'static, + T: Send + 'static, + { + let inner_handle = self.inner.spawn(move || { + self.priority.apply_to_current_thread(); + f() + })?; + + Ok(JoinHandle { + inner: Some(inner_handle), + allow_leak: false, + }) + } +} + +pub(crate) struct JoinHandle { + // `inner` is an `Option` so that we can + // take ownership of the contained `JoinHandle`. 
+ inner: Option>, + allow_leak: bool, +} + +impl JoinHandle { + pub(crate) fn join(mut self) -> T { + self.inner.take().unwrap().join() + } +} + +impl Drop for JoinHandle { + fn drop(&mut self) { + if !self.allow_leak { + return; + } + + if let Some(join_handle) = self.inner.take() { + join_handle.detach(); + } + } +} + +impl fmt::Debug for JoinHandle { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + f.pad("JoinHandle { .. }") + } +} diff --git a/crates/ruff_server/src/server/schedule/thread/pool.rs b/crates/ruff_server/src/server/schedule/thread/pool.rs new file mode 100644 index 00000000000000..78089d6e15845b --- /dev/null +++ b/crates/ruff_server/src/server/schedule/thread/pool.rs @@ -0,0 +1,106 @@ +// +------------------------------------------------------------+ +// | Code adopted from: | +// | Repository: https://github.com/rust-lang/rust-analyzer.git | +// | File: `crates/stdx/src/thread/pool.rs` | +// | Commit: 03b3cb6be9f21c082f4206b35c7fe7f291c94eaa | +// +------------------------------------------------------------+ +//! [`Pool`] implements a basic custom thread pool +//! inspired by the [`threadpool` crate](http://docs.rs/threadpool). +//! When you spawn a task you specify a thread priority +//! so the pool can schedule it to run on a thread with that priority. +//! rust-analyzer uses this to prioritize work based on latency requirements. +//! +//! The thread pool is implemented entirely using +//! the threading utilities in [`crate::server::schedule::thread`]. + +use std::sync::{ + atomic::{AtomicUsize, Ordering}, + Arc, +}; + +use crossbeam::channel::{Receiver, Sender}; + +use super::{Builder, JoinHandle, ThreadPriority}; + +pub(crate) struct Pool { + // `_handles` is never read: the field is present + // only for its `Drop` impl. + + // The worker threads exit once the channel closes; + // make sure to keep `job_sender` above `handles` + // so that the channel is actually closed + // before we join the worker threads! 
+ job_sender: Sender, + _handles: Vec, + extant_tasks: Arc, +} + +struct Job { + requested_priority: ThreadPriority, + f: Box, +} + +impl Pool { + pub(crate) fn new(threads: usize) -> Pool { + const STACK_SIZE: usize = 8 * 1024 * 1024; + const INITIAL_PRIORITY: ThreadPriority = ThreadPriority::Worker; + + let (job_sender, job_receiver) = crossbeam::channel::unbounded(); + let extant_tasks = Arc::new(AtomicUsize::new(0)); + + let mut handles = Vec::with_capacity(threads); + for _ in 0..threads { + let handle = Builder::new(INITIAL_PRIORITY) + .stack_size(STACK_SIZE) + .name("Worker".into()) + .spawn({ + let extant_tasks = Arc::clone(&extant_tasks); + let job_receiver: Receiver = job_receiver.clone(); + move || { + let mut current_priority = INITIAL_PRIORITY; + for job in job_receiver { + if job.requested_priority != current_priority { + job.requested_priority.apply_to_current_thread(); + current_priority = job.requested_priority; + } + extant_tasks.fetch_add(1, Ordering::SeqCst); + (job.f)(); + extant_tasks.fetch_sub(1, Ordering::SeqCst); + } + } + }) + .expect("failed to spawn thread"); + + handles.push(handle); + } + + Pool { + _handles: handles, + extant_tasks, + job_sender, + } + } + + pub(crate) fn spawn(&self, priority: ThreadPriority, f: F) + where + F: FnOnce() + Send + 'static, + { + let f = Box::new(move || { + if cfg!(debug_assertions) { + priority.assert_is_used_on_current_thread(); + } + f(); + }); + + let job = Job { + requested_priority: priority, + f, + }; + self.job_sender.send(job).unwrap(); + } + + #[allow(dead_code)] + pub(super) fn len(&self) -> usize { + self.extant_tasks.load(Ordering::SeqCst) + } +} diff --git a/crates/ruff_server/src/server/schedule/thread/priority.rs b/crates/ruff_server/src/server/schedule/thread/priority.rs new file mode 100644 index 00000000000000..fa21e4079e929e --- /dev/null +++ b/crates/ruff_server/src/server/schedule/thread/priority.rs @@ -0,0 +1,300 @@ +// 
+------------------------------------------------------------+ +// | Code adopted from: | +// | Repository: https://github.com/rust-lang/rust-analyzer.git | +// | File: `crates/stdx/src/thread/intent.rs` | +// | Commit: 03b3cb6be9f21c082f4206b35c7fe7f291c94eaa | +// +------------------------------------------------------------+ +//! An opaque façade around platform-specific QoS APIs. + +// Keeps us from getting warnings about the word `QoS` +#![allow(clippy::doc_markdown)] + +#[derive(Debug, Clone, Copy, PartialEq, Eq, PartialOrd, Ord)] +// Please maintain order from least to most priority for the derived `Ord` impl. +pub(crate) enum ThreadPriority { + /// Any thread which does work that isn't in a critical path. + Worker, + + /// Any thread which does work caused by the user typing, or + /// work that the editor may wait on. + LatencySensitive, +} + +impl ThreadPriority { + // These APIs must remain private; + // we only want consumers to set thread priority + // during thread creation. + + pub(crate) fn apply_to_current_thread(self) { + let class = thread_priority_to_qos_class(self); + set_current_thread_qos_class(class); + } + + pub(crate) fn assert_is_used_on_current_thread(self) { + if IS_QOS_AVAILABLE { + let class = thread_priority_to_qos_class(self); + assert_eq!(get_current_thread_qos_class(), Some(class)); + } + } +} + +use imp::QoSClass; + +const IS_QOS_AVAILABLE: bool = imp::IS_QOS_AVAILABLE; + +fn set_current_thread_qos_class(class: QoSClass) { + imp::set_current_thread_qos_class(class); +} + +fn get_current_thread_qos_class() -> Option { + imp::get_current_thread_qos_class() +} + +fn thread_priority_to_qos_class(priority: ThreadPriority) -> QoSClass { + imp::thread_priority_to_qos_class(priority) +} + +// All Apple platforms use XNU as their kernel +// and thus have the concept of QoS. 
+#[cfg(target_vendor = "apple")] +mod imp { + use super::ThreadPriority; + + #[derive(Debug, Clone, Copy, PartialEq, Eq, PartialOrd, Ord)] + // Please maintain order from least to most priority for the derived `Ord` impl. + pub(super) enum QoSClass { + // Documentation adapted from https://github.com/apple-oss-distributions/libpthread/blob/67e155c94093be9a204b69637d198eceff2c7c46/include/sys/qos.h#L55 + // + /// TLDR: invisible maintenance tasks + /// + /// Contract: + /// + /// * **You do not care about how long it takes for work to finish.** + /// * **You do not care about work being deferred temporarily.** + /// (e.g. if the device's battery is in a critical state) + /// + /// Examples: + /// + /// * in a video editor: + /// creating periodic backups of project files + /// * in a browser: + /// cleaning up cached sites which have not been accessed in a long time + /// * in a collaborative word processor: + /// creating a searchable index of all documents + /// + /// Use this QoS class for background tasks + /// which the user did not initiate themselves + /// and which are invisible to the user. + /// It is expected that this work will take significant time to complete: + /// minutes or even hours. + /// + /// This QoS class provides the most energy and thermally-efficient execution possible. + /// All other work is prioritized over background tasks. 
+ Background, + + /// TLDR: tasks that don't block using your app + /// + /// Contract: + /// + /// * **Your app remains useful even as the task is executing.** + /// + /// Examples: + /// + /// * in a video editor: + /// exporting a video to disk - + /// the user can still work on the timeline + /// * in a browser: + /// automatically extracting a downloaded zip file - + /// the user can still switch tabs + /// * in a collaborative word processor: + /// downloading images embedded in a document - + /// the user can still make edits + /// + /// Use this QoS class for tasks which + /// may or may not be initiated by the user, + /// but whose result is visible. + /// It is expected that this work will take a few seconds to a few minutes. + /// Typically your app will include a progress bar + /// for tasks using this class. + /// + /// This QoS class provides a balance between + /// performance, responsiveness and efficiency. + Utility, + + /// TLDR: tasks that block using your app + /// + /// Contract: + /// + /// * **You need this work to complete + /// before the user can keep interacting with your app.** + /// * **Your work will not take more than a few seconds to complete.** + /// + /// Examples: + /// + /// * in a video editor: + /// opening a saved project + /// * in a browser: + /// loading a list of the user's bookmarks and top sites + /// when a new tab is created + /// * in a collaborative word processor: + /// running a search on the document's content + /// + /// Use this QoS class for tasks which were initiated by the user + /// and block the usage of your app while they are in progress. + /// It is expected that this work will take a few seconds or less to complete; + /// not long enough to cause the user to switch to something else. + /// Your app will likely indicate progress on these tasks + /// through the display of placeholder content or modals. + /// + /// This QoS class is not energy-efficient. 
+ /// Rather, it provides responsiveness + /// by prioritizing work above other tasks on the system + /// except for critical user-interactive work. + UserInitiated, + + /// TLDR: render loops and nothing else + /// + /// Contract: + /// + /// * **You absolutely need this work to complete immediately + /// or your app will appear to freeze.** + /// * **Your work will always complete virtually instantaneously.** + /// + /// Examples: + /// + /// * the main thread in a GUI application + /// * the update & render loop in a game + /// * a secondary thread which progresses an animation + /// + /// Use this QoS class for any work which, if delayed, + /// will make your user interface unresponsive. + /// It is expected that this work will be virtually instantaneous. + /// + /// This QoS class is not energy-efficient. + /// Specifying this class is a request to run with + /// nearly all available system CPU and I/O bandwidth even under contention. + UserInteractive, + } + + pub(super) const IS_QOS_AVAILABLE: bool = true; + + pub(super) fn set_current_thread_qos_class(class: QoSClass) { + let c = match class { + QoSClass::UserInteractive => libc::qos_class_t::QOS_CLASS_USER_INTERACTIVE, + QoSClass::UserInitiated => libc::qos_class_t::QOS_CLASS_USER_INITIATED, + QoSClass::Utility => libc::qos_class_t::QOS_CLASS_UTILITY, + QoSClass::Background => libc::qos_class_t::QOS_CLASS_BACKGROUND, + }; + + #[allow(unsafe_code)] + let code = unsafe { libc::pthread_set_qos_class_self_np(c, 0) }; + + if code == 0 { + return; + } + + #[allow(unsafe_code)] + let errno = unsafe { *libc::__error() }; + + match errno { + libc::EPERM => { + // This thread has been excluded from the QoS system + // due to a previous call to a function such as `pthread_setschedparam` + // which is incompatible with QoS. + // + // Panic instead of returning an error + // to maintain the invariant that we only use QoS APIs. 
+ panic!("tried to set QoS of thread which has opted out of QoS (os error {errno})") + } + + libc::EINVAL => { + // This is returned if we pass something other than a qos_class_t + // to `pthread_set_qos_class_self_np`. + // + // This is impossible, so again panic. + unreachable!( + "invalid qos_class_t value was passed to pthread_set_qos_class_self_np" + ) + } + + _ => { + // `pthread_set_qos_class_self_np`’s documentation + // does not mention any other errors. + unreachable!("`pthread_set_qos_class_self_np` returned unexpected error {errno}") + } + } + } + + pub(super) fn get_current_thread_qos_class() -> Option { + #[allow(unsafe_code)] + let current_thread = unsafe { libc::pthread_self() }; + let mut qos_class_raw = libc::qos_class_t::QOS_CLASS_UNSPECIFIED; + #[allow(unsafe_code)] + let code = unsafe { + libc::pthread_get_qos_class_np(current_thread, &mut qos_class_raw, std::ptr::null_mut()) + }; + + if code != 0 { + // `pthread_get_qos_class_np`’s documentation states that + // an error value is placed into errno if the return code is not zero. + // However, it never states what errors are possible. + // Inspecting the source[0] shows that, as of this writing, it always returns zero. 
+ // + // Whatever errors the function could report in future are likely to be + // ones which we cannot handle anyway + // + // 0: https://github.com/apple-oss-distributions/libpthread/blob/67e155c94093be9a204b69637d198eceff2c7c46/src/qos.c#L171-L177 + #[allow(unsafe_code)] + let errno = unsafe { *libc::__error() }; + unreachable!("`pthread_get_qos_class_np` failed unexpectedly (os error {errno})"); + } + + match qos_class_raw { + libc::qos_class_t::QOS_CLASS_USER_INTERACTIVE => Some(QoSClass::UserInteractive), + libc::qos_class_t::QOS_CLASS_USER_INITIATED => Some(QoSClass::UserInitiated), + libc::qos_class_t::QOS_CLASS_DEFAULT => None, // QoS has never been set + libc::qos_class_t::QOS_CLASS_UTILITY => Some(QoSClass::Utility), + libc::qos_class_t::QOS_CLASS_BACKGROUND => Some(QoSClass::Background), + + libc::qos_class_t::QOS_CLASS_UNSPECIFIED => { + // Using manual scheduling APIs causes threads to “opt out” of QoS. + // At this point they become incompatible with QoS, + // and as such have the “unspecified” QoS class. + // + // Panic instead of returning an error + // to maintain the invariant that we only use QoS APIs. + panic!("tried to get QoS of thread which has opted out of QoS") + } + } + } + + pub(super) fn thread_priority_to_qos_class(priority: ThreadPriority) -> QoSClass { + match priority { + ThreadPriority::Worker => QoSClass::Utility, + ThreadPriority::LatencySensitive => QoSClass::UserInitiated, + } + } +} + +// FIXME: Windows has QoS APIs, we should use them! 
+#[cfg(not(target_vendor = "apple"))] +mod imp { + use super::ThreadPriority; + + #[derive(Debug, Clone, Copy, PartialEq, Eq, PartialOrd, Ord)] + pub(super) enum QoSClass { + Default, + } + + pub(super) const IS_QOS_AVAILABLE: bool = false; + + pub(super) fn set_current_thread_qos_class(_: QoSClass) {} + + pub(super) fn get_current_thread_qos_class() -> Option { + None + } + + pub(super) fn thread_priority_to_qos_class(_: ThreadPriority) -> QoSClass { + QoSClass::Default + } +} diff --git a/crates/ruff_server/src/session.rs b/crates/ruff_server/src/session.rs new file mode 100644 index 00000000000000..0f1894bed1aaee --- /dev/null +++ b/crates/ruff_server/src/session.rs @@ -0,0 +1,402 @@ +//! Data model, state management, and configuration resolution. + +mod types; + +use std::collections::BTreeMap; +use std::ops::DerefMut; +use std::path::{Path, PathBuf}; +use std::{ops::Deref, sync::Arc}; + +use anyhow::anyhow; +use crossbeam::channel::{unbounded, Receiver}; +use lsp_types::{ServerCapabilities, Url}; +use notify::Watcher; +use ruff_workspace::resolver::{ConfigurationTransformer, Relativity}; +use rustc_hash::FxHashMap; + +use crate::edit::Document; +use crate::PositionEncoding; + +/// The global state for the LSP. +pub(crate) struct Session { + workspaces: Workspaces, + position_encoding: PositionEncoding, + #[allow(dead_code)] + lsp_settings: types::Settings, + watcher: notify::RecommendedWatcher, + watch_recv: Receiver>, +} + +/// An immutable snapshot of `Session` that references +/// a specific document. 
+pub(crate) struct SessionSnapshot { + configuration: Arc, + document_ref: DocumentRef, + position_encoding: PositionEncoding, + url: Url, +} + +#[derive(Default)] +pub(crate) struct Configuration { + // settings to pass into the ruff linter + pub(crate) linter: ruff_linter::settings::LinterSettings, + // settings to pass into the ruff formatter + pub(crate) formatter: ruff_workspace::FormatterSettings, +} + +#[derive(Default)] +pub(crate) struct Workspaces(BTreeMap); + +pub(crate) struct Workspace { + open_documents: OpenDocuments, + configuration: Arc, +} + +#[derive(Default)] +pub(crate) struct OpenDocuments { + documents: FxHashMap, +} + +/// A handler to an underlying document, with a revision counter. +pub(crate) struct DocumentController { + document: Arc, +} + +/// A read-only reference to a document. +#[derive(Clone)] +pub(crate) struct DocumentRef { + document: Arc, +} + +impl Session { + pub(crate) fn new( + server_capabilities: &ServerCapabilities, + workspaces: &[Url], + ) -> crate::Result { + let (tx, rx) = unbounded(); + let mut watcher = notify::recommended_watcher(tx)?; + let paths: Result, _> = workspaces.iter().map(Url::to_file_path).collect(); + for url in paths.map_err(|()| anyhow!("Workspace URL was not a valid file path"))? 
{ + watcher.watch(&url, notify::RecursiveMode::Recursive)?; + } + Ok(Self { + position_encoding: server_capabilities + .position_encoding + .clone() + .and_then(|encoding| encoding.try_into().ok()) + .unwrap_or_default(), + lsp_settings: types::Settings::default(), + workspaces: Workspaces::new(workspaces)?, + watcher, + watch_recv: rx, + }) + } + pub(crate) fn take_snapshot(&self, url: &Url) -> Option { + Some(SessionSnapshot { + configuration: self.workspaces.configuration(url)?.clone(), + document_ref: self.workspaces.doc_snapshot(url)?, + position_encoding: self.position_encoding, + url: url.clone(), + }) + } + + pub(crate) fn open_document(&mut self, url: &Url, contents: String, version: i32) { + self.workspaces.open_document(url, contents, version); + } + + pub(crate) fn close_document(&mut self, url: &Url) -> crate::Result<()> { + self.workspaces.close_document(url)?; + Ok(()) + } + + pub(crate) fn document_controller( + &mut self, + url: &Url, + ) -> crate::Result<&mut DocumentController> { + self.workspaces + .doc_controller(url) + .ok_or_else(|| anyhow!("Tried to open unavailable document `{url}`")) + } + + /// Processes any file changes made since the last call and forwards each event + /// to the appropriate workspace, in the order that they were received. + /// Returns `true` if at least one configuration file was changed in at least + /// one workspace. 
+ pub(crate) fn update_configuration_files(&mut self) -> bool { + let mut configuration_changed = false; + while let Ok(event) = self.watch_recv.try_recv() { + match event { + Ok(event) => { + configuration_changed |= self.workspaces.update_configuration_files(&event); + } + Err(err) => { + tracing::error!("An error occured with the workspace file watcher:\n{err}"); + } + } + } + configuration_changed + } + + pub(crate) fn open_workspace_folder(&mut self, url: &Url) -> crate::Result<()> { + self.workspaces.open_workspace_folder(url)?; + self.track_url(url); + Ok(()) + } + + pub(crate) fn close_workspace_folder(&mut self, url: &Url) -> crate::Result<()> { + self.workspaces.close_workspace_folder(url)?; + self.stop_tracking_url(url); + Ok(()) + } + + pub(crate) fn encoding(&self) -> PositionEncoding { + self.position_encoding + } + + fn track_url(&mut self, url: &Url) { + if let Ok(path) = url.to_file_path() { + // TODO(jane): report error here + let _ = self.watcher.watch(&path, notify::RecursiveMode::Recursive); + } + } + + fn stop_tracking_url(&mut self, url: &Url) { + if let Ok(path) = url.to_file_path() { + // TODO(jane): report error here + let _ = self.watcher.unwatch(&path); + } + } +} + +impl OpenDocuments { + fn doc_snapshot(&self, url: &Url) -> Option { + Some(self.documents.get(url)?.make_ref()) + } + fn doc_controller(&mut self, url: &Url) -> Option<&mut DocumentController> { + self.documents.get_mut(url) + } + fn open_document(&mut self, url: &Url, contents: String, version: i32) { + if self + .documents + .insert(url.clone(), DocumentController::new(contents, version)) + .is_some() + { + tracing::warn!("Opening document `{url}` that is already open!"); + } + } + fn close_document(&mut self, url: &Url) -> crate::Result<()> { + let Some(_) = self.documents.remove(url) else { + return Err(anyhow!( + "Tried to close document `{url}`, which was not open" + )); + }; + Ok(()) + } +} + +impl DocumentController { + fn new(contents: String, version: i32) -> Self 
{ + Self { + document: Arc::new(Document::new(contents, version)), + } + } + fn make_ref(&self) -> DocumentRef { + DocumentRef { + document: self.document.clone(), + } + } +} + +impl Deref for DocumentController { + type Target = Document; + fn deref(&self) -> &Self::Target { + &self.document + } +} + +impl DerefMut for DocumentController { + fn deref_mut(&mut self) -> &mut Self::Target { + Arc::make_mut(&mut self.document) + } +} + +impl Deref for DocumentRef { + type Target = Document; + fn deref(&self) -> &Self::Target { + &self.document + } +} + +impl SessionSnapshot { + pub(crate) fn configuration(&self) -> &Configuration { + &self.configuration + } + + pub(crate) fn document(&self) -> &DocumentRef { + &self.document_ref + } + + pub(crate) fn encoding(&self) -> PositionEncoding { + self.position_encoding + } + + pub(crate) fn url(&self) -> &Url { + &self.url + } +} + +impl Workspaces { + fn new(urls: &[Url]) -> crate::Result { + Ok(Self( + urls.iter() + .map(Workspace::new) + .collect::>()?, + )) + } + + fn update_configuration_files(&mut self, event: ¬ify::Event) -> bool { + for path in &event.paths { + if !matches!( + path.file_name().and_then(|name| name.to_str()), + Some("ruff.toml" | "pyproject.toml") + ) { + continue; + } + if let Some((workspace_path, workspace)) = self.mut_entry_for_path(path) { + workspace.reload_configuration(&workspace_path); + return true; + } + } + false + } + + fn open_workspace_folder(&mut self, folder_url: &Url) -> crate::Result<()> { + let (path, workspace) = Workspace::new(folder_url)?; + self.0.insert(path, workspace); + Ok(()) + } + + fn close_workspace_folder(&mut self, folder_url: &Url) -> crate::Result<()> { + let path = folder_url + .to_file_path() + .map_err(|()| anyhow!("Folder URI was not a proper file path"))?; + self.0 + .remove(&path) + .ok_or_else(|| anyhow!("Tried to remove non-existent folder {}", path.display()))?; + Ok(()) + } + + fn doc_snapshot(&self, document_url: &Url) -> Option { + 
self.workspace_for_url(document_url) + .and_then(|w| w.open_documents.doc_snapshot(document_url)) + } + + fn doc_controller(&mut self, document_url: &Url) -> Option<&mut DocumentController> { + self.mut_workspace_for_url(document_url) + .and_then(|w| w.open_documents.doc_controller(document_url)) + } + + fn configuration(&self, document_url: &Url) -> Option<&Arc> { + self.workspace_for_url(document_url) + .map(|w| &w.configuration) + } + + fn open_document(&mut self, url: &Url, contents: String, version: i32) { + if let Some(w) = self.mut_workspace_for_url(url) { + w.open_documents.open_document(url, contents, version); + } + } + + fn close_document(&mut self, url: &Url) -> crate::Result<()> { + self.mut_workspace_for_url(url) + .ok_or_else(|| anyhow!("Workspace not found for {url}"))? + .open_documents + .close_document(url) + } + + fn workspace_for_url(&self, url: &Url) -> Option<&Workspace> { + let path = url.to_file_path().ok()?; + self.0 + .keys() + .filter(|p| path.starts_with(p)) + .max_by_key(|p| p.as_os_str().len()) + .and_then(|u| self.0.get(u)) + } + + fn mut_workspace_for_url(&mut self, url: &Url) -> Option<&mut Workspace> { + let path = url.to_file_path().ok()?; + self.0 + .keys() + .filter(|p| path.starts_with(p)) + .max_by_key(|p| p.as_os_str().len()) + .cloned() + .and_then(|u| self.0.get_mut(&u)) + } + + fn mut_entry_for_path(&mut self, path: &Path) -> Option<(PathBuf, &mut Workspace)> { + self.0 + .keys() + .filter(|p| path.starts_with(p)) + .max_by_key(|p| p.as_os_str().len()) + .cloned() + .and_then(|u| { + let workspace = self.0.get_mut(&u)?; + Some((u, workspace)) + }) + } +} + +impl Workspace { + pub(crate) fn new(root: &Url) -> crate::Result<(PathBuf, Self)> { + let path = root + .to_file_path() + .map_err(|()| anyhow!("workspace URL was not a file path!"))?; + // Fall-back to default configuration + let configuration = Self::find_configuration_or_fallback(&path); + + Ok(( + path, + Self { + open_documents: OpenDocuments::default(), + 
configuration: Arc::new(configuration), + }, + )) + } + + fn reload_configuration(&mut self, path: &Path) { + self.configuration = Arc::new(Self::find_configuration_or_fallback(path)); + } + + fn find_configuration_or_fallback(root: &Path) -> Configuration { + find_configuration_from_root(root).unwrap_or_else(|err| { + tracing::error!("The following error occured when trying to find a configuration file at `{}`:\n{err}", root.display()); + tracing::error!("Falling back to default configuration for `{}`", root.display()); + Configuration::default() + }) + } +} + +pub(crate) fn find_configuration_from_root(root: &Path) -> crate::Result { + let pyproject = ruff_workspace::pyproject::find_settings_toml(root)? + .ok_or_else(|| anyhow!("No pyproject.toml/ruff.toml file was found"))?; + let settings = ruff_workspace::resolver::resolve_root_settings( + &pyproject, + Relativity::Parent, + &LSPConfigTransformer, + )?; + Ok(Configuration { + linter: settings.linter, + formatter: settings.formatter, + }) +} + +struct LSPConfigTransformer; + +impl ConfigurationTransformer for LSPConfigTransformer { + fn transform( + &self, + config: ruff_workspace::configuration::Configuration, + ) -> ruff_workspace::configuration::Configuration { + config + } +} diff --git a/crates/ruff_server/src/session/types.rs b/crates/ruff_server/src/session/types.rs new file mode 100644 index 00000000000000..14fedc6ed7e49c --- /dev/null +++ b/crates/ruff_server/src/session/types.rs @@ -0,0 +1,35 @@ +#![allow(dead_code)] // TODO(jane): get this wired up after the pre-release + +#[derive(Debug, Clone, PartialEq, Eq)] +enum WhenToRun { + OnType, + OnSave, +} + +#[derive(Debug, Clone, PartialEq, Eq, Default)] +enum LogLevel { + #[default] + Error, + Warning, + Info, + Debug, +} + +#[derive(Debug, Clone, PartialEq, Eq, Default)] +pub(crate) struct Settings { + log_level: LogLevel, + code_action: CodeActionSettings, + organize_imports: bool, + fix_all: bool, + linter: LinterSettings, + formatter: 
FormatterSettings, +} + +#[derive(Debug, Clone, PartialEq, Eq, Default)] +pub(crate) struct CodeActionSettings; + +#[derive(Debug, Clone, PartialEq, Eq, Default)] +pub(crate) struct LinterSettings; + +#[derive(Debug, Clone, PartialEq, Eq, Default)] +pub(crate) struct FormatterSettings; diff --git a/crates/ruff_server/tests/document.rs b/crates/ruff_server/tests/document.rs new file mode 100644 index 00000000000000..dd8dc374d0f14f --- /dev/null +++ b/crates/ruff_server/tests/document.rs @@ -0,0 +1,91 @@ +const PANDAS_HTML_SRC: &str = include_str!("../resources/test/fixtures/pandas_html.py"); + +use lsp_types::{Position, Range, TextDocumentContentChangeEvent}; +use ruff_server::{Document, PositionEncoding}; + +#[test] +fn delete_lines_pandas_html() { + let mut document = Document::new(PANDAS_HTML_SRC.to_string(), 1); + + let changes = vec![ + TextDocumentContentChangeEvent { + range: Some(Range { + start: Position { + line: 79, + character: 0, + }, + end: Position { + line: 91, + character: 67, + }, + }), + range_length: Some(388), + text: "".into(), + }, + TextDocumentContentChangeEvent { + range: Some(Range { + start: Position { + line: 81, + character: 4, + }, + end: Position { + line: 81, + character: 36, + }, + }), + range_length: Some(32), + text: "p".into(), + }, + TextDocumentContentChangeEvent { + range: Some(Range { + start: Position { + line: 81, + character: 5, + }, + end: Position { + line: 81, + character: 5, + }, + }), + range_length: Some(0), + text: "a".into(), + }, + TextDocumentContentChangeEvent { + range: Some(Range { + start: Position { + line: 81, + character: 6, + }, + end: Position { + line: 81, + character: 6, + }, + }), + range_length: Some(0), + text: "s".into(), + }, + TextDocumentContentChangeEvent { + range: Some(Range { + start: Position { + line: 81, + character: 7, + }, + end: Position { + line: 81, + character: 7, + }, + }), + range_length: Some(0), + text: "s".into(), + }, + ]; + + let mut version = 2; + + for change in changes { + 
document.apply_changes(vec![change], version, PositionEncoding::UTF16); + version += 1; + } + + insta::assert_snapshot!(document.contents()); +} diff --git a/crates/ruff_server/tests/snapshots/document__delete_lines_pandas_html.snap b/crates/ruff_server/tests/snapshots/document__delete_lines_pandas_html.snap new file mode 100644 index 00000000000000..2ba81d2007ecea --- /dev/null +++ b/crates/ruff_server/tests/snapshots/document__delete_lines_pandas_html.snap @@ -0,0 +1,1233 @@ +--- +source: crates/ruff_server/tests/document.rs +expression: document.contents() +--- +# +------------------------------------------------------------+ +# | Code adopted from: | +# | Repository: https://github.com/pandas-dev/pandas.git | +# | File: `io/html.py` | +# | Commit: 1f622e2b5303650fa5e497e4552d0554e51049cb | +# +------------------------------------------------------------+ +# This file should be used to test LSP functions that edit / fix a file. + +""" +:mod:`pandas.io.html` is a module containing functionality for dealing with +HTML IO. 
+ +""" + +from __future__ import annotations + +from collections import abc +import errno +import numbers +import os +import re +from re import Pattern +from typing import ( + TYPE_CHECKING, + Literal, + cast, +) + +from pandas._libs import lib +from pandas.compat._optional import import_optional_dependency +from pandas.errors import ( + AbstractMethodError, + EmptyDataError, +) +from pandas.util._decorators import doc +from pandas.util._validators import check_dtype_backend + +from pandas.core.dtypes.common import is_list_like + +from pandas import isna +from pandas.core.indexes.base import Index +from pandas.core.indexes.multi import MultiIndex +from pandas.core.series import Series +from pandas.core.shared_docs import _shared_docs + +from pandas.io.common import ( + get_handle, + is_url, + stringify_path, + validate_header_arg, +) +from pandas.io.formats.printing import pprint_thing +from pandas.io.parsers import TextParser + +if TYPE_CHECKING: + from collections.abc import ( + Iterable, + Sequence, + ) + + from pandas._typing import ( + BaseBuffer, + DtypeBackend, + FilePath, + HTMLFlavors, + ReadBuffer, + StorageOptions, + ) + + from pandas import DataFrame + +############# +# READ HTML # +############# +_RE_WHITESPACE = re.compile(r"[\r\n]+|\s{2,}") + + +def _remove_whitespace(s: str, regex: Pattern = _RE_WHITESPACE) -> str: + """ + + """ + pass + + +def _get_skiprows(skiprows: int | Sequence[int] | slice | None) -> int | Sequence[int]: + """ + Get an iterator given an integer, slice or container. + + Parameters + ---------- + skiprows : int, slice, container + The iterator to use to skip rows; can also be a slice. + + Raises + ------ + TypeError + * If `skiprows` is not a slice, integer, or Container + + Returns + ------- + it : iterable + A proper iterator to use to skip rows of a DataFrame. 
+ """ + if isinstance(skiprows, slice): + start, step = skiprows.start or 0, skiprows.step or 1 + return list(range(start, skiprows.stop, step)) + elif isinstance(skiprows, numbers.Integral) or is_list_like(skiprows): + return cast("int | Sequence[int]", skiprows) + elif skiprows is None: + return 0 + raise TypeError(f"{type(skiprows).__name__} is not a valid type for skipping rows") + + +def _read( + obj: FilePath | BaseBuffer, + encoding: str | None, + storage_options: StorageOptions | None, +) -> str | bytes: + """ + Try to read from a url, file or string. + + Parameters + ---------- + obj : str, unicode, path object, or file-like object + + Returns + ------- + raw_text : str + """ + try: + with get_handle( + obj, "r", encoding=encoding, storage_options=storage_options + ) as handles: + return handles.handle.read() + except OSError as err: + if not is_url(obj): + raise FileNotFoundError( + f"[Errno {errno.ENOENT}] {os.strerror(errno.ENOENT)}: {obj}" + ) from err + raise + + +class _HtmlFrameParser: + """ + Base class for parsers that parse HTML into DataFrames. + + Parameters + ---------- + io : str or file-like + This can be either a string path, a valid URL using the HTTP, + FTP, or FILE protocols or a file-like object. + + match : str or regex + The text to match in the document. + + attrs : dict + List of HTML element attributes to match. + + encoding : str + Encoding to be used by parser + + displayed_only : bool + Whether or not items with "display:none" should be ignored + + extract_links : {None, "all", "header", "body", "footer"} + Table elements in the specified section(s) with tags will have their + href extracted. + + .. versionadded:: 1.5.0 + + Attributes + ---------- + io : str or file-like + raw HTML, URL, or file-like object + + match : regex + The text to match in the raw HTML + + attrs : dict-like + A dictionary of valid table attributes to use to search for table + elements. 
+ + encoding : str + Encoding to be used by parser + + displayed_only : bool + Whether or not items with "display:none" should be ignored + + extract_links : {None, "all", "header", "body", "footer"} + Table elements in the specified section(s) with tags will have their + href extracted. + + .. versionadded:: 1.5.0 + + Notes + ----- + To subclass this class effectively you must override the following methods: + * :func:`_build_doc` + * :func:`_attr_getter` + * :func:`_href_getter` + * :func:`_text_getter` + * :func:`_parse_td` + * :func:`_parse_thead_tr` + * :func:`_parse_tbody_tr` + * :func:`_parse_tfoot_tr` + * :func:`_parse_tables` + * :func:`_equals_tag` + See each method's respective documentation for details on their + functionality. + """ + + def __init__( + self, + io: FilePath | ReadBuffer[str] | ReadBuffer[bytes], + match: str | Pattern, + attrs: dict[str, str] | None, + encoding: str, + displayed_only: bool, + extract_links: Literal[None, "header", "footer", "body", "all"], + storage_options: StorageOptions = None, + ) -> None: + self.io = io + self.match = match + self.attrs = attrs + self.encoding = encoding + self.displayed_only = displayed_only + self.extract_links = extract_links + self.storage_options = storage_options + + def parse_tables(self): + """ + Parse and return all tables from the DOM. + + Returns + ------- + list of parsed (header, body, footer) tuples from tables. + """ + tables = self._parse_tables(self._build_doc(), self.match, self.attrs) + return (self._parse_thead_tbody_tfoot(table) for table in tables) + + def _attr_getter(self, obj, attr): + """ + Return the attribute value of an individual DOM node. + + Parameters + ---------- + obj : node-like + A DOM node. + + attr : str or unicode + The attribute, such as "colspan" + + Returns + ------- + str or unicode + The attribute value. 
+ """ + # Both lxml and BeautifulSoup have the same implementation: + return obj.get(attr) + + def _href_getter(self, obj) -> str | None: + """ + Return a href if the DOM node contains a child or None. + + Parameters + ---------- + obj : node-like + A DOM node. + + Returns + ------- + href : str or unicode + The href from the child of the DOM node. + """ + raise AbstractMethodError(self) + + def _text_getter(self, obj): + """ + Return the text of an individual DOM node. + + Parameters + ---------- + obj : node-like + A DOM node. + + Returns + ------- + text : str or unicode + The text from an individual DOM node. + """ + raise AbstractMethodError(self) + + def _parse_td(self, obj): + """ + Return the td elements from a row element. + + Parameters + ---------- + obj : node-like + A DOM node. + + Returns + ------- + list of node-like + These are the elements of each row, i.e., the columns. + """ + raise AbstractMethodError(self) + + def _parse_thead_tr(self, table): + """ + Return the list of thead row elements from the parsed table element. + + Parameters + ---------- + table : a table element that contains zero or more thead elements. + + Returns + ------- + list of node-like + These are the row elements of a table. + """ + raise AbstractMethodError(self) + + def _parse_tbody_tr(self, table): + """ + Return the list of tbody row elements from the parsed table element. + + HTML5 table bodies consist of either 0 or more elements (which + only contain elements) or 0 or more elements. This method + checks for both structures. + + Parameters + ---------- + table : a table element that contains row elements. + + Returns + ------- + list of node-like + These are the row elements of a table. + """ + raise AbstractMethodError(self) + + def _parse_tfoot_tr(self, table): + """ + Return the list of tfoot row elements from the parsed table element. + + Parameters + ---------- + table : a table element that contains row elements. 
+ + Returns + ------- + list of node-like + These are the row elements of a table. + """ + raise AbstractMethodError(self) + + def _parse_tables(self, document, match, attrs): + """ + Return all tables from the parsed DOM. + + Parameters + ---------- + document : the DOM from which to parse the table element. + + match : str or regular expression + The text to search for in the DOM tree. + + attrs : dict + A dictionary of table attributes that can be used to disambiguate + multiple tables on a page. + + Raises + ------ + ValueError : `match` does not match any text in the document. + + Returns + ------- + list of node-like + HTML
elements to be parsed into raw data. + """ + raise AbstractMethodError(self) + + def _equals_tag(self, obj, tag) -> bool: + """ + Return whether an individual DOM node matches a tag + + Parameters + ---------- + obj : node-like + A DOM node. + + tag : str + Tag name to be checked for equality. + + Returns + ------- + boolean + Whether `obj`'s tag name is `tag` + """ + raise AbstractMethodError(self) + + def _build_doc(self): + """ + Return a tree-like object that can be used to iterate over the DOM. + + Returns + ------- + node-like + The DOM from which to parse the table element. + """ + raise AbstractMethodError(self) + + def _parse_thead_tbody_tfoot(self, table_html): + """ + Given a table, return parsed header, body, and foot. + + Parameters + ---------- + table_html : node-like + + Returns + ------- + tuple of (header, body, footer), each a list of list-of-text rows. + + Notes + ----- + Header and body are lists-of-lists. Top level list is a list of + rows. Each row is a list of str text. + + Logic: Use , , elements to identify + header, body, and footer, otherwise: + - Put all rows into body + - Move rows from top of body to header only if + all elements inside row are . Move the top all- or + while body_rows and row_is_all_th(body_rows[0]): + header_rows.append(body_rows.pop(0)) + + header = self._expand_colspan_rowspan(header_rows, section="header") + body = self._expand_colspan_rowspan(body_rows, section="body") + footer = self._expand_colspan_rowspan(footer_rows, section="footer") + + return header, body, footer + + def _expand_colspan_rowspan( + self, rows, section: Literal["header", "footer", "body"] + ) -> list[list]: + """ + Given a list of s, return a list of text rows. + + Parameters + ---------- + rows : list of node-like + List of s + section : the section that the rows belong to (header, body or footer). + + Returns + ------- + list of list + Each returned row is a list of str text, or tuple (text, link) + if extract_links is not None. 
+ + Notes + ----- + Any cell with ``rowspan`` or ``colspan`` will have its contents copied + to subsequent cells. + """ + all_texts = [] # list of rows, each a list of str + text: str | tuple + remainder: list[ + tuple[int, str | tuple, int] + ] = [] # list of (index, text, nrows) + + for tr in rows: + texts = [] # the output for this row + next_remainder = [] + + index = 0 + tds = self._parse_td(tr) + for td in tds: + # Append texts from previous rows with rowspan>1 that come + # before this or (see _parse_thead_tr). + return row.xpath("./td|./th") + + def _parse_tables(self, document, match, kwargs): + pattern = match.pattern + + # 1. check all descendants for the given pattern and only search tables + # GH 49929 + xpath_expr = f"//table[.//text()[re:test(., {pattern!r})]]" + + # if any table attributes were given build an xpath expression to + # search for them + if kwargs: + xpath_expr += _build_xpath_expr(kwargs) + + tables = document.xpath(xpath_expr, namespaces=_re_namespace) + + tables = self._handle_hidden_tables(tables, "attrib") + if self.displayed_only: + for table in tables: + # lxml utilizes XPATH 1.0 which does not have regex + # support. As a result, we find all elements with a style + # attribute and iterate them to check for display:none + for elem in table.xpath(".//style"): + elem.drop_tree() + for elem in table.xpath(".//*[@style]"): + if "display:none" in elem.attrib.get("style", "").replace(" ", ""): + elem.drop_tree() + if not tables: + raise ValueError(f"No tables found matching regex {pattern!r}") + return tables + + def _equals_tag(self, obj, tag) -> bool: + return obj.tag == tag + + def _build_doc(self): + """ + Raises + ------ + ValueError + * If a URL that lxml cannot parse is passed. + + Exception + * Any other ``Exception`` thrown. For example, trying to parse a + URL that is syntactically correct on a machine with no internet + connection will fail. 
+ + See Also + -------- + pandas.io.html._HtmlFrameParser._build_doc + """ + from lxml.etree import XMLSyntaxError + from lxml.html import ( + HTMLParser, + parse, + ) + + parser = HTMLParser(recover=True, encoding=self.encoding) + + if is_url(self.io): + with get_handle(self.io, "r", storage_options=self.storage_options) as f: + r = parse(f.handle, parser=parser) + else: + # try to parse the input in the simplest way + try: + r = parse(self.io, parser=parser) + except OSError as err: + raise FileNotFoundError( + f"[Errno {errno.ENOENT}] {os.strerror(errno.ENOENT)}: {self.io}" + ) from err + try: + r = r.getroot() + except AttributeError: + pass + else: + if not hasattr(r, "text_content"): + raise XMLSyntaxError("no text parsed from document", 0, 0, 0) + + for br in r.xpath("*//br"): + br.tail = "\n" + (br.tail or "") + + return r + + def _parse_thead_tr(self, table): + rows = [] + + for thead in table.xpath(".//thead"): + rows.extend(thead.xpath("./tr")) + + # HACK: lxml does not clean up the clearly-erroneous + # . (Missing ). Add + # the and _pretend_ it's a ; _parse_td() will find its + # children as though it's a . + # + # Better solution would be to use html5lib. 
+ elements_at_root = thead.xpath("./td|./th") + if elements_at_root: + rows.append(thead) + + return rows + + def _parse_tbody_tr(self, table): + from_tbody = table.xpath(".//tbody//tr") + from_root = table.xpath("./tr") + # HTML spec: at most one of these lists has content + return from_tbody + from_root + + def _parse_tfoot_tr(self, table): + return table.xpath(".//tfoot//tr") + + +def _expand_elements(body) -> None: + data = [len(elem) for elem in body] + lens = Series(data) + lens_max = lens.max() + not_max = lens[lens != lens_max] + + empty = [""] + for ind, length in not_max.items(): + body[ind] += empty * (lens_max - length) + + +def _data_to_frame(**kwargs): + head, body, foot = kwargs.pop("data") + header = kwargs.pop("header") + kwargs["skiprows"] = _get_skiprows(kwargs["skiprows"]) + if head: + body = head + body + + # Infer header when there is a or top
+ - Move rows from bottom of body to footer only if + all elements inside row are + """ + header_rows = self._parse_thead_tr(table_html) + body_rows = self._parse_tbody_tr(table_html) + footer_rows = self._parse_tfoot_tr(table_html) + + def row_is_all_th(row): + return all(self._equals_tag(t, "th") for t in self._parse_td(row)) + + if not header_rows: + # The table has no
rows from + # body_rows to header_rows. (This is a common case because many + # tables in the wild have no
+ while remainder and remainder[0][0] <= index: + prev_i, prev_text, prev_rowspan = remainder.pop(0) + texts.append(prev_text) + if prev_rowspan > 1: + next_remainder.append((prev_i, prev_text, prev_rowspan - 1)) + index += 1 + + # Append the text from this , colspan times + text = _remove_whitespace(self._text_getter(td)) + if self.extract_links in ("all", section): + href = self._href_getter(td) + text = (text, href) + rowspan = int(self._attr_getter(td, "rowspan") or 1) + colspan = int(self._attr_getter(td, "colspan") or 1) + + for _ in range(colspan): + texts.append(text) + if rowspan > 1: + next_remainder.append((index, text, rowspan - 1)) + index += 1 + + # Append texts from previous rows at the final position + for prev_i, prev_text, prev_rowspan in remainder: + texts.append(prev_text) + if prev_rowspan > 1: + next_remainder.append((prev_i, prev_text, prev_rowspan - 1)) + + all_texts.append(texts) + remainder = next_remainder + + # Append rows that only appear because the previous row had non-1 + # rowspan + while remainder: + next_remainder = [] + texts = [] + for prev_i, prev_text, prev_rowspan in remainder: + texts.append(prev_text) + if prev_rowspan > 1: + next_remainder.append((prev_i, prev_text, prev_rowspan - 1)) + all_texts.append(texts) + remainder = next_remainder + + return all_texts + + def _handle_hidden_tables(self, tbl_list, attr_name: str): + """ + Return list of tables, potentially removing hidden elements + + Parameters + ---------- + tbl_list : list of node-like + Type of list elements will vary depending upon parser used + attr_name : str + Name of the accessor for retrieving HTML attributes + + Returns + ------- + list of node-like + Return type matches `tbl_list` + """ + if not self.displayed_only: + return tbl_list + + return [ + x + for x in tbl_list + if "display:none" + not in getattr(x, attr_name).get("style", "").replace(" ", "") + ] + + +class _BeautifulSoupHtml5LibFrameParser(_HtmlFrameParser): + """ + HTML to DataFrame parser 
that uses BeautifulSoup under the hood. + + See Also + -------- + pandas.io.html._HtmlFrameParser + pandas.io.html._LxmlFrameParser + + Notes + ----- + Documentation strings for this class are in the base class + :class:`pandas.io.html._HtmlFrameParser`. + """ + + def __init__(self, *args, **kwargs) -> None: + super().__init__(*args, **kwargs) + from bs4 import SoupStrainer + + self._strainer = SoupStrainer("table") + + def _parse_tables(self, document, match, attrs): + element_name = self._strainer.name + tables = document.find_all(element_name, attrs=attrs) + if not tables: + raise ValueError("No tables found") + + result = [] + unique_tables = set() + tables = self._handle_hidden_tables(tables, "attrs") + + for table in tables: + if self.displayed_only: + for elem in table.find_all("style"): + elem.decompose() + + for elem in table.find_all(style=re.compile(r"display:\s*none")): + elem.decompose() + + if table not in unique_tables and table.find(string=match) is not None: + result.append(table) + unique_tables.add(table) + if not result: + raise ValueError(f"No tables found matching pattern {match.pattern!r}") + return result + + def _href_getter(self, obj) -> str | None: + a = obj.find("a", href=True) + return None if not a else a["href"] + + def _text_getter(self, obj): + return obj.text + + def _equals_tag(self, obj, tag) -> bool: + return obj.name == tag + + def _parse_td(self, row): + return row.find_all(("td", "th"), recursive=False) + + def _parse_thead_tr(self, table): + return table.select("thead tr") + + def _parse_tbody_tr(self, table): + from_tbody = table.select("tbody tr") + from_root = table.find_all("tr", recursive=False) + # HTML spec: at most one of these lists has content + return from_tbody + from_root + + def _parse_tfoot_tr(self, table): + return table.select("tfoot tr") + + def _setup_build_doc(self): + raw_text = _read(self.io, self.encoding, self.storage_options) + if not raw_text: + raise ValueError(f"No text parsed from document: 
{self.io}") + return raw_text + + def _build_doc(self): + from bs4 import BeautifulSoup + + bdoc = self._setup_build_doc() + if isinstance(bdoc, bytes) and self.encoding is not None: + udoc = bdoc.decode(self.encoding) + from_encoding = None + else: + udoc = bdoc + from_encoding = self.encoding + + soup = BeautifulSoup(udoc, features="html5lib", from_encoding=from_encoding) + + for br in soup.find_all("br"): + br.replace_with("\n" + br.text) + + return soup + + +def _build_xpath_expr(attrs) -> str: + """ + Build an xpath expression to simulate bs4's ability to pass in kwargs to + search for attributes when using the lxml parser. + + Parameters + ---------- + attrs : dict + A dict of HTML attributes. These are NOT checked for validity. + + Returns + ------- + expr : unicode + An XPath expression that checks for the given HTML attributes. + """ + # give class attribute as class_ because class is a python keyword + if "class_" in attrs: + attrs["class"] = attrs.pop("class_") + + s = " and ".join([f"@{k}={v!r}" for k, v in attrs.items()]) + return f"[{s}]" + + +_re_namespace = {"re": "http://exslt.org/regular-expressions"} + + +class _LxmlFrameParser(_HtmlFrameParser): + """ + HTML to DataFrame parser that uses lxml under the hood. + + Warning + ------- + This parser can only handle HTTP, FTP, and FILE urls. + + See Also + -------- + _HtmlFrameParser + _BeautifulSoupLxmlFrameParser + + Notes + ----- + Documentation strings for this class are in the base class + :class:`_HtmlFrameParser`. + """ + + def _href_getter(self, obj) -> str | None: + href = obj.xpath(".//a/@href") + return None if not href else href[0] + + def _text_getter(self, obj): + return obj.text_content() + + def _parse_td(self, row): + # Look for direct children only: the "row" element here may be a + #
foobar
-only rows + if header is None: + if len(head) == 1: + header = 0 + else: + # ignore all-empty-text rows + header = [i for i, row in enumerate(head) if any(text for text in row)] + + if foot: + body += foot + + # fill out elements of body that are "ragged" + _expand_elements(body) + with TextParser(body, header=header, **kwargs) as tp: + return tp.read() + + +_valid_parsers = { + "lxml": _LxmlFrameParser, + None: _LxmlFrameParser, + "html5lib": _BeautifulSoupHtml5LibFrameParser, + "bs4": _BeautifulSoupHtml5LibFrameParser, +} + + +def _parser_dispatch(flavor: HTMLFlavors | None) -> type[_HtmlFrameParser]: + """ + Choose the parser based on the input flavor. + + Parameters + ---------- + flavor : {{"lxml", "html5lib", "bs4"}} or None + The type of parser to use. This must be a valid backend. + + Returns + ------- + cls : _HtmlFrameParser subclass + The parser class based on the requested input flavor. + + Raises + ------ + ValueError + * If `flavor` is not a valid backend. + ImportError + * If you do not have the requested `flavor` + """ + valid_parsers = list(_valid_parsers.keys()) + if flavor not in valid_parsers: + raise ValueError( + f"{flavor!r} is not a valid flavor, valid flavors are {valid_parsers}" + ) + + if flavor in ("bs4", "html5lib"): + import_optional_dependency("html5lib") + import_optional_dependency("bs4") + else: + import_optional_dependency("lxml.etree") + return _valid_parsers[flavor] + + +def _print_as_set(s) -> str: + arg = ", ".join([pprint_thing(el) for el in s]) + return f"{{{arg}}}" + + +def _validate_flavor(flavor): + if flavor is None: + flavor = "lxml", "bs4" + elif isinstance(flavor, str): + flavor = (flavor,) + elif isinstance(flavor, abc.Iterable): + if not all(isinstance(flav, str) for flav in flavor): + raise TypeError( + f"Object of type {type(flavor).__name__!r} " + f"is not an iterable of strings" + ) + else: + msg = repr(flavor) if isinstance(flavor, str) else str(flavor) + msg += " is not a valid flavor" + raise ValueError(msg) 
+ + flavor = tuple(flavor) + valid_flavors = set(_valid_parsers) + flavor_set = set(flavor) + + if not flavor_set & valid_flavors: + raise ValueError( + f"{_print_as_set(flavor_set)} is not a valid set of flavors, valid " + f"flavors are {_print_as_set(valid_flavors)}" + ) + return flavor + + +def _parse( + flavor, + io, + match, + attrs, + encoding, + displayed_only, + extract_links, + storage_options, + **kwargs, +): + flavor = _validate_flavor(flavor) + compiled_match = re.compile(match) # you can pass a compiled regex here + + retained = None + for flav in flavor: + parser = _parser_dispatch(flav) + p = parser( + io, + compiled_match, + attrs, + encoding, + displayed_only, + extract_links, + storage_options, + ) + + try: + tables = p.parse_tables() + except ValueError as caught: + # if `io` is an io-like object, check if it's seekable + # and try to rewind it before trying the next parser + if hasattr(io, "seekable") and io.seekable(): + io.seek(0) + elif hasattr(io, "seekable") and not io.seekable(): + # if we couldn't rewind it, let the user know + raise ValueError( + f"The flavor {flav} failed to parse your input. " + "Since you passed a non-rewindable file " + "object, we can't rewind it to try " + "another parser. Try read_html() with a different flavor." + ) from caught + + retained = caught + else: + break + else: + assert retained is not None # for mypy + raise retained + + ret = [] + for table in tables: + try: + df = _data_to_frame(data=table, **kwargs) + # Cast MultiIndex header to an Index of tuples when extracting header + # links and replace nan with None (therefore can't use mi.to_flat_index()). + # This maintains consistency of selection (e.g. 
df.columns.str[1]) + if extract_links in ("all", "header") and isinstance( + df.columns, MultiIndex + ): + df.columns = Index( + ((col[0], None if isna(col[1]) else col[1]) for col in df.columns), + tupleize_cols=False, + ) + + ret.append(df) + except EmptyDataError: # empty table + continue + return ret + + +@doc(storage_options=_shared_docs["storage_options"]) +def read_html( + io: FilePath | ReadBuffer[str], + *, + match: str | Pattern = ".+", + flavor: HTMLFlavors | Sequence[HTMLFlavors] | None = None, + header: int | Sequence[int] | None = None, + index_col: int | Sequence[int] | None = None, + skiprows: int | Sequence[int] | slice | None = None, + attrs: dict[str, str] | None = None, + parse_dates: bool = False, + thousands: str | None = ",", + encoding: str | None = None, + decimal: str = ".", + converters: dict | None = None, + na_values: Iterable[object] | None = None, + keep_default_na: bool = True, + displayed_only: bool = True, + extract_links: Literal[None, "header", "footer", "body", "all"] = None, + dtype_backend: DtypeBackend | lib.NoDefault = lib.no_default, + storage_options: StorageOptions = None, +) -> list[DataFrame]: + r""" + Read HTML tables into a ``list`` of ``DataFrame`` objects. + + Parameters + ---------- + io : str, path object, or file-like object + String, path object (implementing ``os.PathLike[str]``), or file-like + object implementing a string ``read()`` function. + The string can represent a URL. Note that + lxml only accepts the http, ftp and file url protocols. If you have a + URL that starts with ``'https'`` you might try removing the ``'s'``. + + .. deprecated:: 2.1.0 + Passing html literal strings is deprecated. + Wrap literal string/bytes input in ``io.StringIO``/``io.BytesIO`` instead. + + match : str or compiled regular expression, optional + The set of tables containing text matching this regex or string will be + returned. Unless the HTML is extremely simple you will probably need to + pass a non-empty string here. 
Defaults to '.+' (match any non-empty + string). The default value will return all tables contained on a page. + This value is converted to a regular expression so that there is + consistent behavior between Beautiful Soup and lxml. + + flavor : {{"lxml", "html5lib", "bs4"}} or list-like, optional + The parsing engine (or list of parsing engines) to use. 'bs4' and + 'html5lib' are synonymous with each other, they are both there for + backwards compatibility. The default of ``None`` tries to use ``lxml`` + to parse and if that fails it falls back on ``bs4`` + ``html5lib``. + + header : int or list-like, optional + The row (or list of rows for a :class:`~pandas.MultiIndex`) to use to + make the columns headers. + + index_col : int or list-like, optional + The column (or list of columns) to use to create the index. + + skiprows : int, list-like or slice, optional + Number of rows to skip after parsing the column integer. 0-based. If a + sequence of integers or a slice is given, will skip the rows indexed by + that sequence. Note that a single element sequence means 'skip the nth + row' whereas an integer means 'skip n rows'. + + attrs : dict, optional + This is a dictionary of attributes that you can pass to use to identify + the table in the HTML. These are not checked for validity before being + passed to lxml or Beautiful Soup. However, these attributes must be + valid HTML table attributes to work correctly. For example, :: + + attrs = {{"id": "table"}} + + is a valid attribute dictionary because the 'id' HTML tag attribute is + a valid HTML attribute for *any* HTML tag as per `this document + `__. :: + + attrs = {{"asdf": "table"}} + + is *not* a valid attribute dictionary because 'asdf' is not a valid + HTML attribute even if it is a valid XML attribute. Valid HTML 4.01 + table attributes can be found `here + `__. A + working draft of the HTML 5 spec can be found `here + `__. It contains the + latest information on table attributes for the modern web. 
+ + parse_dates : bool, optional + See :func:`~read_csv` for more details. + + thousands : str, optional + Separator to use to parse thousands. Defaults to ``','``. + + encoding : str, optional + The encoding used to decode the web page. Defaults to ``None``.``None`` + preserves the previous encoding behavior, which depends on the + underlying parser library (e.g., the parser library will try to use + the encoding provided by the document). + + decimal : str, default '.' + Character to recognize as decimal point (e.g. use ',' for European + data). + + converters : dict, default None + Dict of functions for converting values in certain columns. Keys can + either be integers or column labels, values are functions that take one + input argument, the cell (not column) content, and return the + transformed content. + + na_values : iterable, default None + Custom NA values. + + keep_default_na : bool, default True + If na_values are specified and keep_default_na is False the default NaN + values are overridden, otherwise they're appended to. + + displayed_only : bool, default True + Whether elements with "display: none" should be parsed. + + extract_links : {{None, "all", "header", "body", "footer"}} + Table elements in the specified section(s) with tags will have their + href extracted. + + .. versionadded:: 1.5.0 + + dtype_backend : {{'numpy_nullable', 'pyarrow'}}, default 'numpy_nullable' + Back-end data type applied to the resultant :class:`DataFrame` + (still experimental). Behaviour is as follows: + + * ``"numpy_nullable"``: returns nullable-dtype-backed :class:`DataFrame` + (default). + * ``"pyarrow"``: returns pyarrow-backed nullable :class:`ArrowDtype` + DataFrame. + + .. versionadded:: 2.0 + + {storage_options} + + .. versionadded:: 2.1.0 + + Returns + ------- + dfs + A list of DataFrames. + + See Also + -------- + read_csv : Read a comma-separated values (csv) file into DataFrame. 
+ + Notes + ----- + Before using this function you should read the :ref:`gotchas about the + HTML parsing libraries <io.html.gotchas>`. + + Expect to do some cleanup after you call this function. For example, you + might need to manually assign column names if the column names are + converted to NaN when you pass the `header=0` argument. We try to assume as + little as possible about the structure of the table and push the + idiosyncrasies of the HTML contained in the table to the user. + + This function searches for ``<table>`` elements and only for ``<tr>`` + and ``<th>`` rows and ``<td>`` elements within each ``<tr>`` or ``<th>`` + element in the table. ``<td>`` stands for "table data". This function + attempts to properly handle ``colspan`` and ``rowspan`` attributes. + If the function has a ``<thead>`` argument, it is used to construct + the header, otherwise the function attempts to find the header within + the body (by putting rows with only ``<th>
`` elements into the header). + + Similar to :func:`~read_csv` the `header` argument is applied + **after** `skiprows` is applied. + + This function will *always* return a list of :class:`DataFrame` *or* + it will fail, e.g., it will *not* return an empty list. + + Examples + -------- + See the :ref:`read_html documentation in the IO section of the docs + ` for some examples of reading in HTML tables. + """ + # Type check here. We don't want to parse only to fail because of an + # invalid value of an integer skiprows. + if isinstance(skiprows, numbers.Integral) and skiprows < 0: + raise ValueError( + "cannot skip rows starting from the end of the " + "data (you passed a negative value)" + ) + if extract_links not in [None, "header", "footer", "body", "all"]: + raise ValueError( + "`extract_links` must be one of " + '{None, "header", "footer", "body", "all"}, got ' + f'"{extract_links}"' + ) + + validate_header_arg(header) + check_dtype_backend(dtype_backend) + + io = stringify_path(io) + + return _parse( + flavor=flavor, + io=io, + match=match, + header=header, + index_col=index_col, + skiprows=skiprows, + parse_dates=parse_dates, + thousands=thousands, + attrs=attrs, + encoding=encoding, + decimal=decimal, + converters=converters, + na_values=na_values, + keep_default_na=keep_default_na, + displayed_only=displayed_only, + extract_links=extract_links, + dtype_backend=dtype_backend, + storage_options=storage_options, + ) + diff --git a/crates/ruff_source_file/src/line_index.rs b/crates/ruff_source_file/src/line_index.rs index 31db33eb84f8c7..7f9022fff41489 100644 --- a/crates/ruff_source_file/src/line_index.rs +++ b/crates/ruff_source_file/src/line_index.rs @@ -129,6 +129,11 @@ impl LineIndex { self.line_starts().len() } + /// Returns `true` if the text only consists of ASCII characters + pub fn is_ascii(&self) -> bool { + self.kind().is_ascii() + } + /// Returns the row number for a given offset. /// /// ## Examples