Skip to content

Commit

Permalink
Allow subscript numbers in identifiers
Browse files Browse the repository at this point in the history
Subscripts will format from `__` followed by consecutive digits.
  • Loading branch information
Omnikar committed Jun 27, 2024
1 parent 934fe4c commit e29f4c0
Show file tree
Hide file tree
Showing 2 changed files with 44 additions and 7 deletions.
29 changes: 24 additions & 5 deletions src/lex.rs
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,9 @@ use unicode_segmentation::UnicodeSegmentation;

use crate::{ast::PlaceholderOp, ArraySwizzle, Inputs, Primitive, StackSwizzle, WILDCARD_CHAR};

/// Subscript digit characters
///
/// Ordered by value: index `d` holds the subscript form of the decimal
/// digit `d`, so the table can be indexed directly with a digit's value.
pub const SUBSCRIPT_NUMS: [char; 10] = ['₀', '₁', '₂', '₃', '₄', '₅', '₆', '₇', '₈', '₉'];

/// Lex a Uiua source file
pub fn lex(
input: &str,
Expand Down Expand Up @@ -793,7 +796,7 @@ impl<'a> Lexer<'a> {
"]" => self.end(CloseBracket, start),
"⟨" => self.end(OpenAngle, start),
"⟩" => self.end(CloseAngle, start),
"_" => self.end(Underscore, start),
"_" if self.peek_char() != Some("_") => self.end(Underscore, start),
"|" => self.end(Bar, start),
";" => self.end(Semicolon, start),
"-" if self.next_chars_exact(["-", "-"]) => self.end(TripleMinus, start),
Expand Down Expand Up @@ -1002,12 +1005,28 @@ impl<'a> Lexer<'a> {
}
}
// Identifiers and unformatted glyphs
c if is_custom_glyph(c) || c.chars().all(is_ident_char) || c == "&" => {
c if is_custom_glyph(c) || c.chars().all(is_ident_char) || c == "&" || c == "_" => {
let mut ident = c.to_string();
// Collect characters
if !is_custom_glyph(c) {
while let Some(c) = self.next_char_if_all(is_ident_char) {
ident.push_str(c);
// Handle identifiers beginning with __
if c == "_" && self.next_char_exact("_") {
ident.push_str("__");
while let Some(dc) = self.next_char_if_all(|c| c.is_ascii_digit()) {
ident.push_str(dc);
}
}
loop {
if let Some(c) = self.next_char_if_all(is_ident_char) {
ident.push_str(c);
} else if self.next_chars_exact(["_"; 2]) {
ident.push_str("__");
while let Some(dc) = self.next_char_if_all(|c| c.is_ascii_digit()) {
ident.push_str(dc);
}
} else {
break;
}
}
}
let mut exclam_count = 0;
Expand Down Expand Up @@ -1376,7 +1395,7 @@ fn parse_format_fragments(s: &str) -> Vec<String> {

/// Whether a character can be part of a Uiua identifier
///
/// A character qualifies when it is alphabetic and is not one of the
/// excluded characters `ⁿₙπτηℂλ`, or when it is one of the subscript
/// digits in `SUBSCRIPT_NUMS`. `&&` binds tighter than `||`, so the
/// exclusion list only applies to the alphabetic test.
pub fn is_ident_char(c: char) -> bool {
c.is_alphabetic() && !"ⁿₙπτηℂλ".contains(c)
c.is_alphabetic() && !"ⁿₙπτηℂλ".contains(c) || SUBSCRIPT_NUMS.contains(&c)
}

/// Whether a string is a custom glyph
Expand Down
22 changes: 20 additions & 2 deletions src/parse.rs
Original file line number Diff line number Diff line change
Expand Up @@ -479,6 +479,7 @@ impl<'i> Parser<'i> {
Token::Ident if line.is_some() => {
let line = line.as_mut().unwrap();
let ident = canonicalize_exclams(&self.input[span.byte_range()]);
let ident = canonicalize_subscripts(&ident);
let name = span.clone().sp(ident);
line.items.push(name);
}
Expand Down Expand Up @@ -523,8 +524,9 @@ impl<'i> Parser<'i> {
}
/// Try to read an identifier token and return its name together with its span.
///
/// Yields `None` (through `?`) when `try_exact(Token::Ident)` returns `None`.
/// Otherwise the identifier text is sliced out of the input by the span's
/// byte range and canonicalized (exclamation marks, then subscripts) before
/// being attached to the span.
fn try_ident(&mut self) -> Option<Sp<Ident>> {
let span = self.try_exact(Token::Ident)?;
let s: Ident = canonicalize_exclams(&self.input[span.byte_range()]);
Some(span.sp(s))
let ident = canonicalize_exclams(&self.input[span.byte_range()]);
let ident = canonicalize_subscripts(&ident);
Some(span.sp(ident))
}
fn try_ref(&mut self) -> Option<Sp<Word>> {
let mut checkpoint = self.index;
Expand Down Expand Up @@ -1353,6 +1355,22 @@ pub fn canonicalize_exclams(ident: &str) -> Ident {
place_exclams(ident, num_margs)
}

/// Rewrite the ASCII spelling of subscripts in an identifier to subscript glyphs.
///
/// Every `_` is dropped and every ASCII digit `0`-`9` is replaced by the
/// character at that index in `crate::lex::SUBSCRIPT_NUMS`. All other
/// characters are kept unchanged.
pub fn canonicalize_subscripts(ident: &str) -> Ident {
    // No validation is done here on purpose: the lexer enforces the stricter
    // syntax rules, so a blanket per-character rewrite is enough.
    ident
        .chars()
        .filter_map(|ch| match (ch, ch.to_digit(10)) {
            // Underscores only introduce a subscript; they are never kept.
            ('_', _) => None,
            // `to_digit(10)` yields 0..=9, always a valid index into the table.
            (_, Some(digit)) => Some(crate::lex::SUBSCRIPT_NUMS[digit as usize]),
            (other, None) => Some(other),
        })
        .collect()
}

pub(crate) fn count_placeholders(words: &[Sp<Word>]) -> usize {
let mut count = 0;
for word in words {
Expand Down

0 comments on commit e29f4c0

Please sign in to comment.