From 852983b7ede0a97c59963d3934543051b66aae01 Mon Sep 17 00:00:00 2001 From: Kai Schmidt Date: Wed, 18 Sep 2024 09:29:19 -0700 Subject: [PATCH] fix another lexing bug --- src/format.rs | 11 ++++++----- src/lex.rs | 47 ++++++++++++++++++++++++++++------------------- 2 files changed, 34 insertions(+), 24 deletions(-) diff --git a/src/format.rs b/src/format.rs index 24255bbfa..c44581570 100644 --- a/src/format.rs +++ b/src/format.rs @@ -16,7 +16,8 @@ use paste::paste; use crate::{ ast::*, grid_fmt::GridFmt, - lex::{is_ident_char, CodeSpan, Loc, Sp}, + is_ident_start, + lex::{CodeSpan, Loc, Sp}, parse::{flip_unsplit_lines, parse, split_words, trim_spaces}, Compiler, FunctionId, Ident, InputSrc, Inputs, PreEvalMode, Primitive, RunMode, SafeSys, Signature, Uiua, UiuaErrorKind, UiuaResult, Value, SUBSCRIPT_NUMS, @@ -748,7 +749,7 @@ impl<'a> Formatter<'a> { self.format_ref_path(&r.path); if r.path.is_empty() && r.name.value.starts_with(|c: char| c.is_lowercase()) - && (self.output.chars().last()).is_some_and(|c| c.is_lowercase() && is_ident_char(c)) + && (self.output.chars().last()).is_some_and(|c| c.is_lowercase() && is_ident_start(c)) { self.output.push(' '); } @@ -758,7 +759,7 @@ impl<'a> Formatter<'a> { if let Some(first) = comps.first() { if first.module.value.starts_with(|c: char| c.is_lowercase()) && (self.output.chars().last()) - .is_some_and(|c| c.is_lowercase() && is_ident_char(c)) + .is_some_and(|c| c.is_lowercase() && is_ident_start(c)) { self.output.push(' '); } @@ -887,7 +888,7 @@ impl<'a> Formatter<'a> { } Word::Ref(r) => { if (self.output.chars().rev()) - .take_while(|&c| is_ident_char(c)) + .take_while(|&c| is_ident_start(c)) .any(|c| c.is_uppercase()) { self.output.push(' '); @@ -896,7 +897,7 @@ impl<'a> Formatter<'a> { } Word::IncompleteRef { path, .. 
} => { if (self.output.chars().rev()) - .take_while(|&c| is_ident_char(c)) + .take_while(|&c| is_ident_start(c)) .any(|c| c.is_uppercase()) { self.output.push(' '); diff --git a/src/lex.rs b/src/lex.rs index 5689e9d40..09ec3dde2 100644 --- a/src/lex.rs +++ b/src/lex.rs @@ -888,8 +888,17 @@ impl<'a> Lexer<'a> { "_" => { if self.next_char_exact("_") { let mut n = 0; - while let Some(c) = self.next_char_if_all(|c| c.is_ascii_digit()) { - n = n * 10 + c.parse::<usize>().unwrap(); + loop { + if let Some(c) = self.next_char_if_all(|c| c.is_ascii_digit()) { + n = n * 10 + c.parse::<usize>().unwrap(); + } else if let Some(c) = + self.next_char_if_all(|c| SUBSCRIPT_NUMS.contains(&c)) + { + let c = c.chars().next().unwrap(); + n = n * 10 + SUBSCRIPT_NUMS.iter().position(|&d| d == c).unwrap(); + } else { + break; + } } self.end(Subscript(n), start) } else { @@ -1305,25 +1314,24 @@ impl<'a> Lexer<'a> { return s; } if !is_custom_glyph(c) { + let mut started_subscript = false; // Handle identifiers beginning with __ - if c == "_" && self.next_char_exact("_") { - s.push('_'); - while let Some(c) = self.next_char_if_all(|c| c.is_ascii_digit()) { - s.push_str(c); - } - } else { - loop { - if let Some(c) = self.next_char_if_all(is_ident_char) { + loop { + if self.next_chars_exact(["_"; 2]) { + s.push_str("__"); + while let Some(c) = self.next_char_if_all(|c| c.is_ascii_digit()) { s.push_str(c); - } else if self.next_chars_exact(["_"; 2]) { - s.push_str("__"); - while let Some(c) = self.next_char_if_all(|c| c.is_ascii_digit()) { - s.push_str(c); - } - break; - } else { - break; } + started_subscript = true; + } else if let Some(c) = + self.next_char_if_all(|c| !started_subscript && is_ident_start(c)) + { + s.push_str(c); + } else if let Some(c) = self.next_char_if_all(|c| SUBSCRIPT_NUMS.contains(&c)) { + s.push_str(c); + started_subscript = true; + } else { + break; } } } @@ -1561,7 +1569,8 @@ pub fn is_ident_char(c: char) -> bool { is_ident_start(c) || SUBSCRIPT_NUMS.contains(&c) } -fn 
is_ident_start(c: char) -> bool { +/// Whether a character can be among the first characters of a Uiua identifier +pub fn is_ident_start(c: char) -> bool { c.is_alphabetic() && !"ⁿₙπτηℂλ".contains(c) }