diff --git a/.github/copilot-instructions.md b/.github/copilot-instructions.md new file mode 100644 index 000000000000..7c66296774e2 --- /dev/null +++ b/.github/copilot-instructions.md @@ -0,0 +1,54 @@ +## Purpose +Short, actionable notes to help AI coding agents be productive in this repository (microsoft/edit). + +## Big picture (what to change and where) +- Single-crate Rust editor. Core library code lives in `src/` and the CLI/UI binary is `src/bin/edit/`. +- UI loop and terminal handling are centered in `src/bin/edit/main.rs` (vt parser, input parser, TUI render loop). +- Platform abstractions and low-level IO live in `src/sys/` (`unix.rs`, `windows.rs`). Prefer changes here for platform behavior. +- Memory allocation / temporary buffers use the project's arena allocator in `src/arena/` (look for `Arena`, `ArenaString`, `scratch_arena`). +- Internationalization is generated at build time from `i18n/edit.toml` by the build script `build/main.rs` and included via `include!(concat!(env!("OUT_DIR"), "/i18n_edit.rs"));` in `src/bin/edit/localization.rs`. + +## Important developer workflows & commands +- Recommended toolchain: Rust stable per `rust-toolchain.toml`, but the README suggests nightly for some builds. If you can't use nightly, `RUSTC_BOOTSTRAP=1` is an alternative. +- Debug (local): + - `cargo build` (normal) + - `cargo run -p edit --` (run the editor binary) + - To enable latency instrumentation: `cargo build --package edit --features debug-latency` (this is used in the TUI loop). +- Release: follow README — either + - `cargo build --config .cargo/release.toml --release` (older Rust) or + - `cargo build --config .cargo/release-nightly.toml --release` (when README instructs nightly-specific config). +- Tests and ignored tests: `cargo test`. Some tests are intentionally marked ignored and require environment tweaks — run `cargo test -- --ignored` to exercise them. +- Benchmarks: `cargo bench` (project uses `criterion`). 
+ +## Build-time environment and integration points +- i18n: `build/main.rs` reads `i18n/edit.toml` and writes `OUT_DIR/i18n_edit.rs`. Rebuild on changes to `i18n/edit.toml`. +- ICU (optional): the build script emits env vars consumed by code. Important envs: + - `EDIT_CFG_ICUUC_SONAME`, `EDIT_CFG_ICUI18N_SONAME`, `EDIT_CFG_ICU_CPP_EXPORTS`, `EDIT_CFG_ICU_RENAMING_VERSION`, `EDIT_CFG_ICU_RENAMING_AUTO_DETECT`. + - These affect `src/sys/*.rs` and `src/icu.rs` where dynamic symbol names are composed using `env!("EDIT_CFG_...")`. + - Example (Linux): `EDIT_CFG_ICUUC_SONAME=libicuuc.so EDIT_CFG_ICUI18N_SONAME=libicui18n.so cargo build`. + +## Project-specific patterns and conventions +- Arena-first memory: many data structures use `Arena` and `ArenaString` for short-lived allocations. Prefer these for UI paths and avoid heap-allocating large temporaries. +- `scratch_arena(None)` is commonly used to create ephemeral buffers inside hot loops — be careful not to extend their lifetimes when refactoring. +- UI code is modularized as `draw_*` modules under `src/bin/edit/` (e.g., `draw_editor.rs`, `draw_menubar.rs`) — add UI elements by following the existing `draw_*` patterns. +- Buffer logic is in `src/buffer/` (gap buffer, line cache, navigation). Changes that affect on-disk or undo behavior likely live there. +- Platform-specific code in `src/sys` is authoritative for terminal and file IO. Changes to terminal modes, raw input, or clipboard should be made there. +- Use existing helpers: `arena_format!`, `KIBI`, `MEBI`, and `MetricFormatter` when producing formatted strings to match style and memory usage. + +## Where to look for common tasks (quick map) +- Add a UI control/widget: `src/bin/edit/draw_*.rs` + `src/bin/edit/state.rs`. +- Change buffer semantics: `src/buffer/gap_buffer.rs`, `src/buffer/line_cache.rs`, `src/buffer/navigation.rs`. +- Localization strings: `i18n/edit.toml` -> `build/main.rs` generates `OUT_DIR/i18n_edit.rs` -> included in `src/bin/edit/localization.rs`. 
+- Terminal/TTY behavior: `src/sys/unix.rs` and `src/sys/windows.rs` and `src/bin/edit/main.rs` (setup_terminal, RestoreModes). +- Low-level unicode/width handling: `src/unicode/*` and `src/icu.rs`. + +## Safety and testing notes for agents +- Avoid modifying unsafe, low-level allocator and lifetime-sensitive code (arena, buffer internals) without running quick checks — these are delicate and rely on invariants across many modules. +- Many behaviors are environment-dependent (terminals, ICU libs). When adding tests, prefer unit tests in `src/` that don't rely on terminal IO; integration tests that require terminal emulation are flakier. + +## Example quick tasks and references +- Run editor on a file: `cargo run -p edit -- README.md` — useful to manually validate UI changes. +- Find localization usage: `src/bin/edit/localization.rs` (includes generated file) and `i18n/edit.toml` (source). +- See platform wrappers: `src/sys/mod.rs`, `src/sys/unix.rs`, `src/sys/windows.rs`. + +If anything here is unclear or you'd like additional examples (e.g., small code-change + test cycle for a UI change), tell me which area to expand and I will iterate. diff --git a/src/bin/edit/documents.rs b/src/bin/edit/documents.rs index 33fc8cf5a76d..b368973a4930 100644 --- a/src/bin/edit/documents.rs +++ b/src/bin/edit/documents.rs @@ -64,18 +64,31 @@ impl Document { self.filename = filename; self.dir = Some(DisplayablePathBuf::from_path(dir)); self.path = Some(path); - self.update_file_mode(); } - fn update_file_mode(&mut self) { + fn apply_file_mode(&mut self, exp_highlighting: bool) { + self.update_file_mode(exp_highlighting); + } + + fn update_file_mode(&mut self, exp_highlighting: bool) { let mut tb = self.buffer.borrow_mut(); tb.set_ruler(if self.filename == "COMMIT_EDITMSG" { 72 } else { 0 }); + + // Syntax highlighting is disabled by default. + // Only enable when experimental highlighting flag is passed. 
+ tb.set_syntax_highlight_enabled(exp_highlighting); } } -#[derive(Default)] pub struct DocumentManager { list: LinkedList, + exp_highlighting: bool, +} + +impl Default for DocumentManager { + fn default() -> Self { + Self { list: LinkedList::new(), exp_highlighting: false } + } } impl DocumentManager { @@ -84,6 +97,10 @@ impl DocumentManager { self.list.len() } + pub fn set_exp_highlighting(&mut self, enabled: bool) { + self.exp_highlighting = enabled; + } + #[inline] pub fn active(&self) -> Option<&Document> { self.list.front() @@ -183,6 +200,7 @@ impl DocumentManager { new_file_counter: 0, }; doc.set_path(path); + doc.apply_file_mode(self.exp_highlighting); if let Some(active) = self.active() && active.path.is_none() diff --git a/src/bin/edit/main.rs b/src/bin/edit/main.rs index 326c88a02ee2..e45ab293b6dc 100644 --- a/src/bin/edit/main.rs +++ b/src/bin/edit/main.rs @@ -248,6 +248,10 @@ fn handle_args(state: &mut State) -> apperr::Result { print_version(); return Ok(true); } + if arg == "--enable-exp-highlighting" { + state.exp_highlighting = true; + continue; + } } let p = cwd.join(Path::new(&arg)); @@ -260,6 +264,9 @@ fn handle_args(state: &mut State) -> apperr::Result { } } + // Apply experimental highlighting setting before adding documents + state.documents.set_exp_highlighting(state.exp_highlighting); + for p in &paths { state.documents.add_file_path(p)?; } @@ -288,8 +295,9 @@ fn print_help() { sys::write_stdout(concat!( "Usage: edit [OPTIONS] [FILE[:LINE[:COLUMN]]]\n", "Options:\n", - " -h, --help Print this help message\n", - " -v, --version Print the version number\n", + " -h, --help Print this help message\n", + " -v, --version Print the version number\n", + " --enable-exp-highlighting Enable experimental syntax highlighting\n", "\n", "Arguments:\n", " FILE[:LINE[:COLUMN]] The file to open, optionally with line and column (e.g., foo.txt:123:45)\n", diff --git a/src/bin/edit/state.rs b/src/bin/edit/state.rs index 451060bf6db4..5dee0e49dae7 100644 --- 
a/src/bin/edit/state.rs +++ b/src/bin/edit/state.rs @@ -172,6 +172,8 @@ pub struct State { pub osc_clipboard_sync: bool, pub osc_clipboard_always_send: bool, pub exit: bool, + + pub exp_highlighting: bool, } impl State { @@ -220,6 +222,8 @@ impl State { osc_clipboard_sync: false, osc_clipboard_always_send: false, exit: false, + + exp_highlighting: false, }) } } diff --git a/src/buffer/mod.rs b/src/buffer/mod.rs index 6f0a714744e4..2875da9d203d 100644 --- a/src/buffer/mod.rs +++ b/src/buffer/mod.rs @@ -25,7 +25,7 @@ mod navigation; use std::borrow::Cow; use std::cell::UnsafeCell; -use std::collections::LinkedList; +use std::collections::{HashMap, LinkedList}; use std::fmt::Write as _; use std::fs::File; use std::io::{Read as _, Write as _}; @@ -245,6 +245,11 @@ pub struct TextBuffer { overtype: bool, wants_cursor_visibility: bool, + // Cache of tokenization results keyed by the starting byte-offset of + // the displayed fragment. + token_cache: HashMap>, + // Whether syntax highlighting is enabled for this buffer. + syntax_highlight_enabled: bool, } impl TextBuffer { @@ -293,6 +298,8 @@ impl TextBuffer { overtype: false, wants_cursor_visibility: false, + token_cache: HashMap::new(), + syntax_highlight_enabled: true, }) } @@ -655,6 +662,9 @@ impl TextBuffer { self.cursor = Default::default(); self.set_selection(None); self.mark_as_clean(); + // Clear token cache because the whole buffer changed. + self.token_cache.clear(); + // Keep highlighting enabled by default; caller may toggle. self.reflow(); } @@ -1965,6 +1975,49 @@ impl TextBuffer { fb.replace_text(destination.top + y, destination.left, destination.right, &line); + // Basic generic syntax highlighting (display-line tokenizer). + // Use a per-fragment cache keyed by the starting byte offset of the + // displayed fragment (`cursor_beg.offset`). This avoids re-tokenizing + // unchanged fragments. Only run when enabled. 
+ let start_offset = cursor_beg.offset; + let tokens = if self.syntax_highlight_enabled { + if let Some(cached) = self.token_cache.get(&start_offset) { + cached.clone() + } else { + // Skip margin characters to tokenize only text content. + // margin_width is in visual columns, which equals char count for the margin. + let margin_chars = self.margin_width as usize; + let text_start = + line.char_indices().nth(margin_chars).map_or(line.len(), |(i, _)| i); + let t = crate::syntax::tokenize_display_line(&line[text_start..]); + self.token_cache.insert(start_offset, t.clone()); + t + } + } else { + Vec::new() + }; + + for tok in tokens.iter() { + if matches!(tok.kind, crate::syntax::TokenKind::Whitespace) { + continue; + } + + let left = destination.left + self.margin_width + tok.start as CoordType; + let right = left + (tok.end.saturating_sub(tok.start)) as CoordType; + if left >= destination.right || right <= destination.left { + continue; + } + + let rect = Rect { + left: left.max(destination.left), + top: destination.top + y, + right: right.min(destination.right), + bottom: destination.top + y + 1, + }; + let color = crate::syntax::token_kind_color(tok.kind); + fb.blend_fg(rect, fb.indexed(color)); + } + cursor = cursor_end; } @@ -2080,6 +2133,19 @@ impl TextBuffer { self.write(text, self.cursor, true); } + /// Enable or disable syntax highlighting for this buffer. + pub fn set_syntax_highlight_enabled(&mut self, enabled: bool) { + if self.syntax_highlight_enabled != enabled { + self.syntax_highlight_enabled = enabled; + self.token_cache.clear(); + } + } + + /// Returns whether syntax highlighting is enabled for this buffer. 
+ pub fn is_syntax_highlight_enabled(&self) -> bool { + self.syntax_highlight_enabled + } + fn write(&mut self, text: &[u8], at: Cursor, raw: bool) { let history_type = if raw { HistoryType::Other } else { HistoryType::Write }; let mut edit_begun = false; @@ -2611,6 +2677,17 @@ impl TextBuffer { fn edit_write(&mut self, text: &[u8]) { let logical_y_before = self.cursor.logical_pos.y; + // Invalidate token cache entries starting at/after the line that contains + // the active edit offset. This makes the cache per-line relative to + // fragment starting offsets and avoids full-cache clears for small edits. + if self.syntax_highlight_enabled { + let off = self.active_edit_off; + let cursor_at_off = self.cursor_move_to_offset_internal(self.cursor, off); + let start_cursor = self.goto_line_start(cursor_at_off, cursor_at_off.logical_pos.y); + let start_off = start_cursor.offset; + self.token_cache.retain(|&k, _| k < start_off); + } + // Copy the written portion into the undo entry. { let mut undo = self.undo_stack.back_mut().unwrap().borrow_mut(); @@ -2636,6 +2713,16 @@ impl TextBuffer { let off = self.active_edit_off; let mut out_off = usize::MAX; + // Invalidate token cache entries starting at/after the line that contains + // the deletion start offset (`off`). This prevents stale tokens from + // being reused after deletion. 
+ if self.syntax_highlight_enabled { + let cursor_at_off = self.cursor_move_to_offset_internal(self.cursor, off); + let start_cursor = self.goto_line_start(cursor_at_off, cursor_at_off.logical_pos.y); + let start_off = start_cursor.offset; + self.token_cache.retain(|&k, _| k < start_off); + } + let mut undo = self.undo_stack.back_mut().unwrap().borrow_mut(); // If this is a continued backspace operation, diff --git a/src/lib.rs b/src/lib.rs index 4a150da197b5..04439c9e738f 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -39,3 +39,4 @@ pub mod sys; pub mod tui; pub mod unicode; pub mod vt; +pub mod syntax; diff --git a/src/syntax.rs b/src/syntax.rs new file mode 100644 index 000000000000..6e41d6ed8938 --- /dev/null +++ b/src/syntax.rs @@ -0,0 +1,179 @@ +// Copyright (c) Microsoft Corporation. +// Licensed under the MIT License. + +use crate::framebuffer::IndexedColor; + +/// A token kind for the display-level generic tokenizer. +#[derive(Clone, Copy, Debug, PartialEq, Eq)] +pub enum TokenKind { + Comment, + String, + Number, + Identifier, + Punctuation, + Whitespace, + Other, +} + +/// A token within a display line measured in character columns (approximate). +#[derive(Clone, Copy, Debug, PartialEq, Eq)] +pub struct Token { + pub kind: TokenKind, + /// Start column (inclusive) within the display line. + pub start: usize, + /// End column (exclusive) within the display line. + pub end: usize, +} + +/// Simple, fast, single-pass tokenizer that operates on the already-processed +/// display line (tabs expanded, control glyphs replaced). It intentionally +/// keeps things minimal and avoids allocations where possible. 
+pub fn tokenize_display_line(line: &str) -> Vec { + let mut out = Vec::new(); + let mut iter = line.chars().peekable(); + let mut char_idx = 0usize; + + while let Some(&ch) = iter.peek() { + let start = char_idx; + + // Whitespace run + if ch.is_whitespace() { + iter.next(); char_idx += 1; + while let Some(&c) = iter.peek() { + if !c.is_whitespace() { break; } + iter.next(); char_idx += 1; + } + let end = char_idx; + out.push(Token { kind: TokenKind::Whitespace, start, end }); + continue; + } + + // Line comment starting with '#' + if ch == '#' { + iter.next(); char_idx += 1; + while iter.next().is_some() { char_idx += 1; } + let end = char_idx; + out.push(Token { kind: TokenKind::Comment, start, end }); + break; + } + + // Possible '//' comment or punctuation '/' + if ch == '/' { + iter.next(); char_idx += 1; + if let Some(&'/') = iter.peek() { + iter.next(); char_idx += 1; + while iter.next().is_some() { char_idx += 1; } + let end = char_idx; + out.push(Token { kind: TokenKind::Comment, start, end }); + break; + } else { + let end = char_idx; + out.push(Token { kind: TokenKind::Punctuation, start, end }); + continue; + } + } + + // Strings + if ch == '"' || ch == '\'' { + let quote = ch; + iter.next(); char_idx += 1; + let mut escaped = false; + while let Some(c) = iter.next() { + char_idx += 1; + if escaped { escaped = false; continue; } + if c == '\\' { escaped = true; continue; } + if c == quote { break; } + } + let end = char_idx; + out.push(Token { kind: TokenKind::String, start, end }); + continue; + } + + // Numbers + if ch.is_ascii_digit() { + iter.next(); char_idx += 1; + while let Some(&c) = iter.peek() { + if c.is_ascii_digit() || c == '.' 
|| c == '_' { iter.next(); char_idx += 1; } else { break } + } + let end = char_idx; + out.push(Token { kind: TokenKind::Number, start, end }); + continue; + } + + // Identifier + if ch.is_alphabetic() || ch == '_' { + iter.next(); char_idx += 1; + while let Some(&c) = iter.peek() { + if c.is_alphanumeric() || c == '_' { iter.next(); char_idx += 1; } else { break } + } + let end = char_idx; + out.push(Token { kind: TokenKind::Identifier, start, end }); + continue; + } + + // Punctuation single char + iter.next(); char_idx += 1; + let end = char_idx; + out.push(Token { kind: TokenKind::Punctuation, start, end }); + } + + out +} + +/// Maps token kinds to an `IndexedColor` from the basic 8-color palette. +pub fn token_kind_color(kind: TokenKind) -> IndexedColor { + match kind { + TokenKind::Comment => IndexedColor::Green, + TokenKind::String => IndexedColor::Red, + TokenKind::Number => IndexedColor::Magenta, + TokenKind::Identifier => IndexedColor::Cyan, + TokenKind::Punctuation => IndexedColor::Yellow, + TokenKind::Whitespace => IndexedColor::White, + TokenKind::Other => IndexedColor::White, + } +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn tokenize_basic_line() { + let s = "let x = 42; // comment"; + let toks = tokenize_display_line(s); + let kinds: Vec = toks.iter().map(|t| t.kind).collect(); + assert_eq!(kinds[0], TokenKind::Identifier); // "let" + assert_eq!(kinds[kinds.len() - 1], TokenKind::Comment); + + // Verify spans for a couple tokens + assert_eq!(toks[0].start, 0); + assert_eq!(toks[0].end, 3); // "let" + // number token should cover "42" + let num_tok = toks.iter().find(|t| t.kind == TokenKind::Number).unwrap(); + let num_text: String = s.chars().skip(num_tok.start).take(num_tok.end - num_tok.start).collect(); + assert_eq!(num_text, "42"); + } + + #[test] + fn tokenize_string_and_ident() { + let s = "\"hello\" world"; + let toks = tokenize_display_line(s); + assert_eq!(toks[0].kind, TokenKind::String); + let str_text: String = 
s.chars().skip(toks[0].start).take(toks[0].end - toks[0].start).collect(); + assert_eq!(str_text, "\"hello\""); + assert_eq!(toks[1].kind, TokenKind::Whitespace); + assert_eq!(toks[2].kind, TokenKind::Identifier); + let id_text: String = s.chars().skip(toks[2].start).take(toks[2].end - toks[2].start).collect(); + assert_eq!(id_text, "world"); + } + + #[test] + fn tokenize_hash_comment() { + let s = " #hi"; + let toks = tokenize_display_line(s); + assert_eq!(toks[0].kind, TokenKind::Whitespace); + assert_eq!(toks[1].kind, TokenKind::Comment); + let c_text: String = s.chars().skip(toks[1].start).take(toks[1].end - toks[1].start).collect(); + assert_eq!(c_text, "#hi"); + } +} diff --git a/src/sys/unix.rs b/src/sys/unix.rs index f3b067bd0366..ca1f0513dd12 100644 --- a/src/sys/unix.rs +++ b/src/sys/unix.rs @@ -76,7 +76,7 @@ pub fn switch_modes() -> apperr::Result<()> { // Set STATE.inject_resize to true whenever we get a SIGWINCH. let mut sigwinch_action: libc::sigaction = mem::zeroed(); - sigwinch_action.sa_sigaction = sigwinch_handler as libc::sighandler_t; + sigwinch_action.sa_sigaction = sigwinch_handler as *const c_void as libc::sighandler_t; check_int_return(libc::sigaction(libc::SIGWINCH, &sigwinch_action, null_mut()))?; // Get the original terminal modes so we can disable raw mode on exit.