uiua-lang · kaikalii · Jun 27, 2024 · Jun 27, 2024 · Jun 27, 2024
diff --git a/src/format.rs b/src/format.rs
@@ -620,7 +620,7 @@ impl<'a> Formatter<'a> {
  }
 
  self.output
- .push_str(&crate::parse::canonicalize_exclams(&binding.name.value));
+ .push_str(&crate::parse::canonicalize_ident(&binding.name.value));
  self.output
  .push_str(if binding.public { " ←" } else { " ↚" });
  if binding.array_macro {

diff --git a/src/lex.rs b/src/lex.rs
@@ -18,6 +18,9 @@ use unicode_segmentation::UnicodeSegmentation;
 
 use crate::{ast::PlaceholderOp, ArraySwizzle, Inputs, Primitive, StackSwizzle, WILDCARD_CHAR};
 
+/// Subscript digit characters, ordered so that index `d` holds the subscript form of the digit `d`
+pub const SUBSCRIPT_NUMS: [char; 10] = ['₀', '₁', '₂', '₃', '₄', '₅', '₆', '₇', '₈', '₉'];
+
 /// Lex a Uiua source file
 pub fn lex(
  input: &str,
@@ -793,7 +796,7 @@ impl<'a> Lexer<'a> {
  "]" => self.end(CloseBracket, start),
  "⟨" => self.end(OpenAngle, start),
  "⟩" => self.end(CloseAngle, start),
- "_" => self.end(Underscore, start),
+ "_" if self.peek_char() != Some("_") => self.end(Underscore, start),
  "|" => self.end(Bar, start),
  ";" => self.end(Semicolon, start),
  "-" if self.next_chars_exact(["-", "-"]) => self.end(TripleMinus, start),
@@ -1002,12 +1005,28 @@ impl<'a> Lexer<'a> {
  }
  }
  // Identifiers and unformatted glyphs
- c if is_custom_glyph(c) || c.chars().all(is_ident_char) || c == "&" => {
+ c if is_custom_glyph(c) || c.chars().all(is_ident_char) || c == "&" || c == "_" => {
  let mut ident = c.to_string();
  // Collect characters
  if !is_custom_glyph(c) {
- while let Some(c) = self.next_char_if_all(is_ident_char) {
- ident.push_str(c);
+ // Handle identifiers beginning with __
+ if c == "_" && self.next_char_exact("_") {
+ ident.push_str("__");
+ while let Some(dc) = self.next_char_if_all(|c| c.is_ascii_digit()) {
+ ident.push_str(dc);
+ }
+ }
+ loop {
+ if let Some(c) = self.next_char_if_all(is_ident_char) {
+ ident.push_str(c);
+ } else if self.next_chars_exact(["_"; 2]) {
+ ident.push_str("__");
+ while let Some(dc) = self.next_char_if_all(|c| c.is_ascii_digit()) {
+ ident.push_str(dc);
+ }
+ } else {
+ break;
+ }
  }
  }
  let mut exclam_count = 0;
@@ -1376,7 +1395,7 @@ fn parse_format_fragments(s: &str) -> Vec<String> {
 
 /// Whether a character can be part of a Uiua identifier
 pub fn is_ident_char(c: char) -> bool {
- c.is_alphabetic() && !"ⁿₙπτηℂλ".contains(c)
+ c.is_alphabetic() && !"ⁿₙπτηℂλ".contains(c) || SUBSCRIPT_NUMS.contains(&c)
 }
 
 /// Whether a string is a custom glyph

diff --git a/src/parse.rs b/src/parse.rs
@@ -478,7 +478,7 @@ impl<'i> Parser<'i> {
  match token {
  Token::Ident if line.is_some() => {
  let line = line.as_mut().unwrap();
- let ident = canonicalize_exclams(&self.input[span.byte_range()]);
+ let ident = canonicalize_ident(&self.input[span.byte_range()]);
  let name = span.clone().sp(ident);
  line.items.push(name);
  }
@@ -523,8 +523,8 @@ impl<'i> Parser<'i> {
  }
  fn try_ident(&mut self) -> Option<Sp<Ident>> {
  let span = self.try_exact(Token::Ident)?;
- let s: Ident = canonicalize_exclams(&self.input[span.byte_range()]);
- Some(span.sp(s))
+ let ident = canonicalize_ident(&self.input[span.byte_range()]);
+ Some(span.sp(ident))
  }
  fn try_ref(&mut self) -> Option<Sp<Word>> {
  let mut checkpoint = self.index;
@@ -1348,11 +1348,32 @@ pub fn place_exclams(ident: &str, count: usize) -> Ident {
 }
 
 /// Rewrite the identifier with the same number of exclamation points using double and single exclamation point characters as needed
-pub fn canonicalize_exclams(ident: &str) -> Ident {
+fn canonicalize_exclams(ident: &str) -> Ident {
  let num_margs = crate::parse::ident_modifier_args(ident);
  place_exclams(ident, num_margs)
 }
 
+/// Rewrite the identifier with every `_` removed and every ASCII digit replaced by its subscript character
+fn canonicalize_subscripts(ident: &str) -> Ident {
+ // Dropping all underscores and converting all digits is hasty, but okay because the
+ // stricter rules about where `__` and digits may appear are handled in the lexer
+ ident
+ .chars()
+ .filter(|c| *c != '_')
+ .map(|c| {
+ if let Some(d) = c.to_digit(10) {
+ crate::lex::SUBSCRIPT_NUMS[d as usize]
+ } else {
+ c
+ }
+ })
+ .collect()
+}
+
+pub fn canonicalize_ident(ident: &str) -> Ident {
+ canonicalize_subscripts(&canonicalize_exclams(ident))
+}
+
 pub(crate) fn count_placeholders(words: &[Sp<Word>]) -> usize {
  let mut count = 0;
  for word in words {