Skip to content

Commit

Permalink
Merge pull request #516 from Omnikar/ident-subscripts
Browse files Browse the repository at this point in the history
Allow subscript numbers in identifiers
  • Loading branch information
kaikalii committed Jun 27, 2024
2 parents 934fe4c + 4e67e85 commit cf41285
Show file tree
Hide file tree
Showing 3 changed files with 50 additions and 10 deletions.
2 changes: 1 addition & 1 deletion src/format.rs
Original file line number Diff line number Diff line change
Expand Up @@ -620,7 +620,7 @@ impl<'a> Formatter<'a> {
}

self.output
.push_str(&crate::parse::canonicalize_exclams(&binding.name.value));
.push_str(&crate::parse::canonicalize_ident(&binding.name.value));
self.output
.push_str(if binding.public { " ←" } else { " ↚" });
if binding.array_macro {
Expand Down
29 changes: 24 additions & 5 deletions src/lex.rs
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,9 @@ use unicode_segmentation::UnicodeSegmentation;

use crate::{ast::PlaceholderOp, ArraySwizzle, Inputs, Primitive, StackSwizzle, WILDCARD_CHAR};

/// Subscript digit characters
///
/// Ordered so that index `d` holds the subscript form of the decimal digit `d`
/// (`SUBSCRIPT_NUMS[0]` is `'₀'`, `SUBSCRIPT_NUMS[9]` is `'₉'`).
pub const SUBSCRIPT_NUMS: [char; 10] = ['₀', '₁', '₂', '₃', '₄', '₅', '₆', '₇', '₈', '₉'];

/// Lex a Uiua source file
pub fn lex(
input: &str,
Expand Down Expand Up @@ -793,7 +796,7 @@ impl<'a> Lexer<'a> {
"]" => self.end(CloseBracket, start),
"⟨" => self.end(OpenAngle, start),
"⟩" => self.end(CloseAngle, start),
"_" => self.end(Underscore, start),
"_" if self.peek_char() != Some("_") => self.end(Underscore, start),
"|" => self.end(Bar, start),
";" => self.end(Semicolon, start),
"-" if self.next_chars_exact(["-", "-"]) => self.end(TripleMinus, start),
Expand Down Expand Up @@ -1002,12 +1005,28 @@ impl<'a> Lexer<'a> {
}
}
// Identifiers and unformatted glyphs
c if is_custom_glyph(c) || c.chars().all(is_ident_char) || c == "&" => {
c if is_custom_glyph(c) || c.chars().all(is_ident_char) || c == "&" || c == "_" => {
let mut ident = c.to_string();
// Collect characters
if !is_custom_glyph(c) {
while let Some(c) = self.next_char_if_all(is_ident_char) {
ident.push_str(c);
// Handle identifiers beginning with __
if c == "_" && self.next_char_exact("_") {
ident.push_str("__");
while let Some(dc) = self.next_char_if_all(|c| c.is_ascii_digit()) {
ident.push_str(dc);
}
}
loop {
if let Some(c) = self.next_char_if_all(is_ident_char) {
ident.push_str(c);
} else if self.next_chars_exact(["_"; 2]) {
ident.push_str("__");
while let Some(dc) = self.next_char_if_all(|c| c.is_ascii_digit()) {
ident.push_str(dc);
}
} else {
break;
}
}
}
let mut exclam_count = 0;
Expand Down Expand Up @@ -1376,7 +1395,7 @@ fn parse_format_fragments(s: &str) -> Vec<String> {

/// Whether a character can be part of a Uiua identifier
///
/// A character qualifies when it is alphabetic and not one of the characters in the
/// exclusion string literal below, or when it is one of the subscript digits listed
/// in `SUBSCRIPT_NUMS`.
pub fn is_ident_char(c: char) -> bool {
// NOTE(review): this is a diff view without +/- markers; the next line appears to be
// the removed expression and the one after it its replacement - confirm against the file.
c.is_alphabetic() && !"ⁿₙπτηℂλ".contains(c)
// `&&` binds tighter than `||`: (alphabetic and not excluded) or a subscript digit.
c.is_alphabetic() && !"ⁿₙπτηℂλ".contains(c) || SUBSCRIPT_NUMS.contains(&c)
}

/// Whether a string is a custom glyph
Expand Down
29 changes: 25 additions & 4 deletions src/parse.rs
Original file line number Diff line number Diff line change
Expand Up @@ -478,7 +478,7 @@ impl<'i> Parser<'i> {
match token {
Token::Ident if line.is_some() => {
let line = line.as_mut().unwrap();
let ident = canonicalize_exclams(&self.input[span.byte_range()]);
let ident = canonicalize_ident(&self.input[span.byte_range()]);
let name = span.clone().sp(ident);
line.items.push(name);
}
Expand Down Expand Up @@ -523,8 +523,8 @@ impl<'i> Parser<'i> {
}
/// Try to match an `Ident` token and return its canonicalized name wrapped in the
/// token's span.
///
/// Returns `None` when `try_exact(Token::Ident)` yields `None`.
fn try_ident(&mut self) -> Option<Sp<Ident>> {
let span = self.try_exact(Token::Ident)?;
// NOTE(review): this is a diff view without +/- markers; the `canonicalize_exclams`
// pair below appears to be the removed lines and the `canonicalize_ident` pair their
// replacement - confirm against the file.
let s: Ident = canonicalize_exclams(&self.input[span.byte_range()]);
Some(span.sp(s))
// The identifier text is the slice of the input covered by the token's byte range.
let ident = canonicalize_ident(&self.input[span.byte_range()]);
Some(span.sp(ident))
}
fn try_ref(&mut self) -> Option<Sp<Word>> {
let mut checkpoint = self.index;
Expand Down Expand Up @@ -1348,11 +1348,32 @@ pub fn place_exclams(ident: &str, count: usize) -> Ident {
}

/// Rewrite the identifier with the same number of exclamation points using double and single exclamation point characters as needed
///
/// The count is obtained from `ident_modifier_args` and handed to `place_exclams`,
/// which produces the returned identifier.
// NOTE(review): this is a diff view without +/- markers; the `pub fn` signature appears
// to be the removed line and the private `fn` signature its replacement - confirm.
pub fn canonicalize_exclams(ident: &str) -> Ident {
fn canonicalize_exclams(ident: &str) -> Ident {
// Count first, then re-place: the original spelling of the exclamation points is not reused here.
let num_margs = crate::parse::ident_modifier_args(ident);
place_exclams(ident, num_margs)
}

/// Rewrite an identifier so that numerals written after `__` become subscript characters
///
/// Every `_` in the input is dropped and every decimal digit is replaced by the entry
/// of `crate::lex::SUBSCRIPT_NUMS` at that digit's index; all other characters pass
/// through unchanged.
fn canonicalize_subscripts(ident: &str) -> Ident {
    // No validation happens here (e.g. that digits really follow `__`): the
    // stricter syntax rules are enforced by the lexer, so a blunt
    // character-by-character rewrite is sufficient.
    let to_subscript = |ch: char| match ch.to_digit(10) {
        Some(digit) => crate::lex::SUBSCRIPT_NUMS[digit as usize],
        None => ch,
    };
    ident
        .chars()
        .filter_map(|ch| match ch {
            '_' => None,
            other => Some(to_subscript(other)),
        })
        .collect()
}

/// Canonicalize an identifier
///
/// Applies `canonicalize_exclams` first and then runs `canonicalize_subscripts`
/// on that result.
pub fn canonicalize_ident(ident: &str) -> Ident {
    let with_exclams = canonicalize_exclams(ident);
    canonicalize_subscripts(&with_exclams)
}

pub(crate) fn count_placeholders(words: &[Sp<Word>]) -> usize {
let mut count = 0;
for word in words {
Expand Down

0 comments on commit cf41285

Please sign in to comment.