Skip to content

Commit e29f4c0

Browse files
committed
Allow subscript numbers in identifiers
Inside an identifier, `__` followed by consecutive ASCII digits is formatted as subscript digits (for example, `x__12` becomes `x₁₂`).
1 parent 934fe4c commit e29f4c0

File tree

2 files changed

+44
-7
lines changed

2 files changed

+44
-7
lines changed

src/lex.rs

Lines changed: 24 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -18,6 +18,9 @@ use unicode_segmentation::UnicodeSegmentation;
1818

1919
use crate::{ast::PlaceholderOp, ArraySwizzle, Inputs, Primitive, StackSwizzle, WILDCARD_CHAR};
2020

21+
/// Subscript digit characters
22+
pub const SUBSCRIPT_NUMS: [char; 10] = ['₀', '₁', '₂', '₃', '₄', '₅', '₆', '₇', '₈', '₉'];
23+
2124
/// Lex a Uiua source file
2225
pub fn lex(
2326
input: &str,
@@ -793,7 +796,7 @@ impl<'a> Lexer<'a> {
793796
"]" => self.end(CloseBracket, start),
794797
"⟨" => self.end(OpenAngle, start),
795798
"⟩" => self.end(CloseAngle, start),
796-
"_" => self.end(Underscore, start),
799+
"_" if self.peek_char() != Some("_") => self.end(Underscore, start),
797800
"|" => self.end(Bar, start),
798801
";" => self.end(Semicolon, start),
799802
"-" if self.next_chars_exact(["-", "-"]) => self.end(TripleMinus, start),
@@ -1002,12 +1005,28 @@ impl<'a> Lexer<'a> {
10021005
}
10031006
}
10041007
// Identifiers and unformatted glyphs
1005-
c if is_custom_glyph(c) || c.chars().all(is_ident_char) || c == "&" => {
1008+
c if is_custom_glyph(c) || c.chars().all(is_ident_char) || c == "&" || c == "_" => {
10061009
let mut ident = c.to_string();
10071010
// Collect characters
10081011
if !is_custom_glyph(c) {
1009-
while let Some(c) = self.next_char_if_all(is_ident_char) {
1010-
ident.push_str(c);
1012+
// Handle identifiers beginning with __
1013+
if c == "_" && self.next_char_exact("_") {
1014+
ident.push_str("__");
1015+
while let Some(dc) = self.next_char_if_all(|c| c.is_ascii_digit()) {
1016+
ident.push_str(dc);
1017+
}
1018+
}
1019+
loop {
1020+
if let Some(c) = self.next_char_if_all(is_ident_char) {
1021+
ident.push_str(c);
1022+
} else if self.next_chars_exact(["_"; 2]) {
1023+
ident.push_str("__");
1024+
while let Some(dc) = self.next_char_if_all(|c| c.is_ascii_digit()) {
1025+
ident.push_str(dc);
1026+
}
1027+
} else {
1028+
break;
1029+
}
10111030
}
10121031
}
10131032
let mut exclam_count = 0;
@@ -1376,7 +1395,7 @@ fn parse_format_fragments(s: &str) -> Vec<String> {
13761395

13771396
/// Whether a character can be part of a Uiua identifier
13781397
pub fn is_ident_char(c: char) -> bool {
1379-
c.is_alphabetic() && !"ⁿₙπτηℂλ".contains(c)
1398+
c.is_alphabetic() && !"ⁿₙπτηℂλ".contains(c) || SUBSCRIPT_NUMS.contains(&c)
13801399
}
13811400

13821401
/// Whether a string is a custom glyph

src/parse.rs

Lines changed: 20 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -479,6 +479,7 @@ impl<'i> Parser<'i> {
479479
Token::Ident if line.is_some() => {
480480
let line = line.as_mut().unwrap();
481481
let ident = canonicalize_exclams(&self.input[span.byte_range()]);
482+
let ident = canonicalize_subscripts(&ident);
482483
let name = span.clone().sp(ident);
483484
line.items.push(name);
484485
}
@@ -523,8 +524,9 @@ impl<'i> Parser<'i> {
523524
}
524525
fn try_ident(&mut self) -> Option<Sp<Ident>> {
525526
let span = self.try_exact(Token::Ident)?;
526-
let s: Ident = canonicalize_exclams(&self.input[span.byte_range()]);
527-
Some(span.sp(s))
527+
let ident = canonicalize_exclams(&self.input[span.byte_range()]);
528+
let ident = canonicalize_subscripts(&ident);
529+
Some(span.sp(ident))
528530
}
529531
fn try_ref(&mut self) -> Option<Sp<Word>> {
530532
let mut checkpoint = self.index;
@@ -1353,6 +1355,22 @@ pub fn canonicalize_exclams(ident: &str) -> Ident {
13531355
place_exclams(ident, num_margs)
13541356
}
13551357

1358+
/// Rewrite the ASCII subscript spelling of an identifier into subscript characters.
///
/// Every `_` in `ident` is removed, and every ASCII digit is replaced by the
/// matching entry of `crate::lex::SUBSCRIPT_NUMS`. All other characters are
/// passed through unchanged, so `x__12` becomes `x₁₂`.
///
/// No validation is done here: an `_` or digit anywhere in the input is
/// rewritten, whether or not it is part of a `__<digits>` group.
pub fn canonicalize_subscripts(ident: &str) -> Ident {
1359+
// This hasty canonicalization is okay because the stricter
1360+
// rules about the syntax are handled in the lexer
1361+
ident
1362+
.chars()
1363+
// Drop every underscore, which includes the `__` marker that introduces a subscript.
.filter(|c| *c != '_')
1364+
.map(|c| {
1365+
// `to_digit(10)` yields `Some(0..=9)` only for ASCII '0'..='9', so `d` is always
// a valid index into the 10-element `SUBSCRIPT_NUMS` table.
if let Some(d) = c.to_digit(10) {
1366+
crate::lex::SUBSCRIPT_NUMS[d as usize]
1367+
} else {
1368+
c
1369+
}
1370+
})
1371+
.collect()
1372+
}
1373+
13561374
pub(crate) fn count_placeholders(words: &[Sp<Word>]) -> usize {
13571375
let mut count = 0;
13581376
for word in words {

0 commit comments

Comments
 (0)