diff --git a/src/format.rs b/src/format.rs
index d67944af..cb1f0a87 100644
--- a/src/format.rs
+++ b/src/format.rs
@@ -620,7 +620,7 @@ impl<'a> Formatter<'a> {
 
                 }
                 self.output
-                    .push_str(&crate::parse::canonicalize_exclams(&binding.name.value));
+                    .push_str(&crate::parse::canonicalize_ident(&binding.name.value));
                 self.output
                     .push_str(if binding.public { " ←" } else { " ↚" });
                 if binding.array_macro {
diff --git a/src/lex.rs b/src/lex.rs
index 28f06f42..2453d876 100644
--- a/src/lex.rs
+++ b/src/lex.rs
@@ -18,6 +18,9 @@ use unicode_segmentation::UnicodeSegmentation;
 
 use crate::{ast::PlaceholderOp, ArraySwizzle, Inputs, Primitive, StackSwizzle, WILDCARD_CHAR};
 
+/// Subscript digit characters
+pub const SUBSCRIPT_NUMS: [char; 10] = ['₀', '₁', '₂', '₃', '₄', '₅', '₆', '₇', '₈', '₉'];
+
 /// Lex a Uiua source file
 pub fn lex(
     input: &str,
@@ -793,7 +796,7 @@ impl<'a> Lexer<'a> {
             "]" => self.end(CloseBracket, start),
             "⟨" => self.end(OpenAngle, start),
             "⟩" => self.end(CloseAngle, start),
-            "_" => self.end(Underscore, start),
+            "_" if self.peek_char() != Some("_") => self.end(Underscore, start),
             "|" => self.end(Bar, start),
             ";" => self.end(Semicolon, start),
            "-" if self.next_chars_exact(["-", "-"]) => self.end(TripleMinus, start),
@@ -1002,12 +1005,28 @@ impl<'a> Lexer<'a> {
                 }
             }
            // Identifiers and unformatted glyphs
-            c if is_custom_glyph(c) || c.chars().all(is_ident_char) || c == "&" => {
+            c if is_custom_glyph(c) || c.chars().all(is_ident_char) || c == "&" || c == "_" => {
                 let mut ident = c.to_string();
                 // Collect characters
                 if !is_custom_glyph(c) {
-                    while let Some(c) = self.next_char_if_all(is_ident_char) {
-                        ident.push_str(c);
+                    // Handle identifiers beginning with __
+                    if c == "_" && self.next_char_exact("_") {
+                        ident.push_str("__");
+                        while let Some(dc) = self.next_char_if_all(|c| c.is_ascii_digit()) {
+                            ident.push_str(dc);
+                        }
+                    }
+                    loop {
+                        if let Some(c) = self.next_char_if_all(is_ident_char) {
+                            ident.push_str(c);
+                        } else if self.next_chars_exact(["_"; 2]) {
+                            ident.push_str("__");
+                            while let Some(dc) = self.next_char_if_all(|c| c.is_ascii_digit()) {
+                                ident.push_str(dc);
+                            }
+                        } else {
+                            break;
+                        }
                     }
                 }
                 let mut exclam_count = 0;
@@ -1376,7 +1395,7 @@ fn parse_format_fragments(s: &str) -> Vec<String> {
 
 /// Whether a character can be part of a Uiua identifier
 pub fn is_ident_char(c: char) -> bool {
-    c.is_alphabetic() && !"ⁿₙπτηℂλ".contains(c)
+    c.is_alphabetic() && !"ⁿₙπτηℂλ".contains(c) || SUBSCRIPT_NUMS.contains(&c)
 }
 
 /// Whether a string is a custom glyph
diff --git a/src/parse.rs b/src/parse.rs
index a0fc81eb..08b09ca1 100644
--- a/src/parse.rs
+++ b/src/parse.rs
@@ -478,7 +478,7 @@ impl<'i> Parser<'i> {
            match token {
                Token::Ident if line.is_some() => {
                    let line = line.as_mut().unwrap();
-                    let ident = canonicalize_exclams(&self.input[span.byte_range()]);
+                    let ident = canonicalize_ident(&self.input[span.byte_range()]);
                    let name = span.clone().sp(ident);
                    line.items.push(name);
                }
@@ -523,8 +523,8 @@ impl<'i> Parser<'i> {
     }
     fn try_ident(&mut self) -> Option<Sp<Ident>> {
         let span = self.try_exact(Token::Ident)?;
-        let s: Ident = canonicalize_exclams(&self.input[span.byte_range()]);
-        Some(span.sp(s))
+        let ident = canonicalize_ident(&self.input[span.byte_range()]);
+        Some(span.sp(ident))
     }
     fn try_ref(&mut self) -> Option<Sp<Word>> {
         let mut checkpoint = self.index;
@@ -1348,11 +1348,32 @@ pub fn place_exclams(ident: &str, count: usize) -> Ident {
 }
 
 /// Rewrite the identifier with the same number of exclamation points using double and single exclamation point characters as needed
-pub fn canonicalize_exclams(ident: &str) -> Ident {
+fn canonicalize_exclams(ident: &str) -> Ident {
     let num_margs = crate::parse::ident_modifier_args(ident);
     place_exclams(ident, num_margs)
 }
 
+/// Rewrite the identifier with numerals preceded by `__` replaced with subscript characters
+fn canonicalize_subscripts(ident: &str) -> Ident {
+    // This hasty canonicalization is okay because the stricter
+    // rules about the syntax are handled in the lexer
+    ident
+        .chars()
+        .filter(|c| *c != '_')
+        .map(|c| {
+            if let Some(d) = c.to_digit(10) {
+                crate::lex::SUBSCRIPT_NUMS[d as usize]
+            } else {
+                c
+            }
+        })
+        .collect()
+}
+
+pub fn canonicalize_ident(ident: &str) -> Ident {
+    canonicalize_subscripts(&canonicalize_exclams(ident))
+}
+
 pub(crate) fn count_placeholders(words: &[Sp<Word>]) -> usize {
     let mut count = 0;
     for word in words {
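Net effect of the patch: `__` followed by ASCII digits is now lexed as part of an identifier, and the parser/formatter canonicalize it to subscript characters (e.g. `x__2` formats to `x₂`). Below is a minimal standalone sketch of just the subscript pass, for illustration only; it is not part of the patch, omits the `canonicalize_exclams` step, and uses plain `String` rather than the crate's `Ident` type.

```rust
// Sketch only: mirrors the logic of the new canonicalize_subscripts pass.
const SUBSCRIPT_NUMS: [char; 10] = ['₀', '₁', '₂', '₃', '₄', '₅', '₆', '₇', '₈', '₉'];

fn canonicalize_subscripts(ident: &str) -> String {
    ident
        .chars()
        // Dropping every '_' is safe here because the lexer only lets ASCII
        // digits into an identifier when they follow a `__` separator.
        .filter(|&c| c != '_')
        .map(|c| match c.to_digit(10) {
            Some(d) => SUBSCRIPT_NUMS[d as usize],
            None => c, // non-digits (including existing subscripts) pass through
        })
        .collect()
}

fn main() {
    assert_eq!(canonicalize_subscripts("x__2"), "x₂");
    assert_eq!(canonicalize_subscripts("Add__10"), "Add₁₀");
}
```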