Allow subscript numbers in identifiers

Omnikar · Omnikar · commit e29f4c093ffe · 2024-06-27T01:28:23.000-04:00
Subscript digits are produced from `__` followed by consecutive digits, e.g. `x__12` becomes `x₁₂`.
diff --git a/src/lex.rs b/src/lex.rs
@@ -18,6 +18,9 @@ use unicode_segmentation::UnicodeSegmentation;
 
 use crate::{ast::PlaceholderOp, ArraySwizzle, Inputs, Primitive, StackSwizzle, WILDCARD_CHAR};
 
+/// Subscript digit characters, ordered so that index `d` holds the subscript form of digit `d`
+pub const SUBSCRIPT_NUMS: [char; 10] = ['₀', '₁', '₂', '₃', '₄', '₅', '₆', '₇', '₈', '₉'];
+
 /// Lex a Uiua source file
 pub fn lex(
     input: &str,
@@ -793,7 +796,7 @@ impl<'a> Lexer<'a> {
                 "]" => self.end(CloseBracket, start),
                 "⟨" => self.end(OpenAngle, start),
                 "⟩" => self.end(CloseAngle, start),
-                "_" => self.end(Underscore, start),
+                "_" if self.peek_char() != Some("_") => self.end(Underscore, start),
                 "|" => self.end(Bar, start),
                 ";" => self.end(Semicolon, start),
                 "-" if self.next_chars_exact(["-", "-"]) => self.end(TripleMinus, start),
@@ -1002,12 +1005,28 @@ impl<'a> Lexer<'a> {
                     }
                 }
                 // Identifiers and unformatted glyphs
-                c if is_custom_glyph(c) || c.chars().all(is_ident_char) || c == "&" => {
+                c if is_custom_glyph(c) || c.chars().all(is_ident_char) || c == "&" || c == "_" => {
                     let mut ident = c.to_string();
                     // Collect characters
                     if !is_custom_glyph(c) {
-                        while let Some(c) = self.next_char_if_all(is_ident_char) {
-                            ident.push_str(c);
+                        // Handle identifiers beginning with __
+                        if c == "_" && self.next_char_exact("_") {
+                            ident.push_str("__");
+                            while let Some(dc) = self.next_char_if_all(|c| c.is_ascii_digit()) {
+                                ident.push_str(dc);
+                            }
+                        }
+                        loop {
+                            if let Some(c) = self.next_char_if_all(is_ident_char) {
+                                ident.push_str(c);
+                            } else if self.next_chars_exact(["_"; 2]) {
+                                ident.push_str("__");
+                                while let Some(dc) = self.next_char_if_all(|c| c.is_ascii_digit()) {
+                                    ident.push_str(dc);
+                                }
+                            } else {
+                                break;
+                            }
                         }
                     }
                     let mut exclam_count = 0;
@@ -1376,7 +1395,7 @@ fn parse_format_fragments(s: &str) -> Vec<String> {
 
 /// Whether a character can be part of a Uiua identifier
 pub fn is_ident_char(c: char) -> bool {
-    c.is_alphabetic() && !"ⁿₙπτηℂλ".contains(c)
+    c.is_alphabetic() && !"ⁿₙπτηℂλ".contains(c) || SUBSCRIPT_NUMS.contains(&c) // `&&` binds tighter than `||`: subscript digits always qualify
 }
 
 /// Whether a string is a custom glyph
diff --git a/src/parse.rs b/src/parse.rs
@@ -479,6 +479,7 @@ impl<'i> Parser<'i> {
                 Token::Ident if line.is_some() => {
                     let line = line.as_mut().unwrap();
                     let ident = canonicalize_exclams(&self.input[span.byte_range()]);
+                    let ident = canonicalize_subscripts(&ident);
                     let name = span.clone().sp(ident);
                     line.items.push(name);
                 }
@@ -523,8 +524,9 @@ impl<'i> Parser<'i> {
     }
     fn try_ident(&mut self) -> Option<Sp<Ident>> {
         let span = self.try_exact(Token::Ident)?;
-        let s: Ident = canonicalize_exclams(&self.input[span.byte_range()]);
-        Some(span.sp(s))
+        let ident = canonicalize_exclams(&self.input[span.byte_range()]);
+        let ident = canonicalize_subscripts(&ident); // drops `_` and turns decimal digits into subscript digits
+        Some(span.sp(ident))
     }
     fn try_ref(&mut self) -> Option<Sp<Word>> {
         let mut checkpoint = self.index;
@@ -1353,6 +1355,22 @@ pub fn canonicalize_exclams(ident: &str) -> Ident {
     place_exclams(ident, num_margs)
 }
 
+pub fn canonicalize_subscripts(ident: &str) -> Ident {
+    // Strips every `_` and maps each decimal digit to its subscript character. Doing this
+    // blindly is okay because the stricter rules about the syntax are handled in the lexer
+    ident
+        .chars()
+        .filter(|c| *c != '_')
+        .map(|c| {
+            if let Some(d) = c.to_digit(10) {
+                crate::lex::SUBSCRIPT_NUMS[d as usize]
+            } else {
+                c
+            }
+        })
+        .collect()
+}
+
 pub(crate) fn count_placeholders(words: &[Sp<Word>]) -> usize {
     let mut count = 0;
     for word in words {