Added tokenizers: Scala3, C, C++, MATLAB
- Added the Pygments library to requirements
- The Pygments library can be used to add more tokenizers in the future
- The checksum of submitted source code is now calculated by ignoring the separator string injected into the first and last line of the file
- New tokenizers for languages: Scala3, C, C++, MATLAB #30
- The previous Scala tokenizer depended on the outdated Scalariform library #33
Harman-Aalto committed Jan 27, 2025
1 parent 9e56f8c commit cee7ee1
Showing 12 changed files with 263 additions and 39 deletions.
13 changes: 12 additions & 1 deletion provider/insert.py
@@ -28,6 +28,16 @@ def insert_submission(exercise, submission_key, submitter_id, data=None):
)


# Remove the separator comment injected by Radar from the first and last line of the submission text, if present
def remove_first_and_last_comment(text: str) -> str:
    lines = text.splitlines()
    if lines and lines[0].startswith(('######', '/******', '<!--', '%%%%%%%%%%%%')):
        lines = lines[1:]
    if lines and lines[-1].startswith(('######', '/******', '<!--', '%%%%%%%%%%%%')):
        lines = lines[:-1]
    return '\n'.join(lines)


def prepare_submission(submission, matching_start_time=''):

    if matching_start_time:
@@ -64,7 +74,8 @@ def prepare_submission(submission, matching_start_time=''):
    # Compute checksum of submitted source code for finding exact character matches quickly
    # This line will not be reached if submission_text contains data not encodable in utf-8,
    # since it is checked in tokenizer.tokenize_submission
    submission_hash = hashlib.md5(submission_text.encode("utf-8"))
    submission_text_without_separators = remove_first_and_last_comment(submission_text)
    submission_hash = hashlib.md5(submission_text_without_separators.encode("utf-8"))
    submission.source_checksum = submission_hash.hexdigest()
    submission.save()

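As a rough illustration of the checksum change above (the separator value "exercise-key" is a made-up placeholder, not something this commit defines), stripping the injected first and last comment lines means submissions that differ only in the separator hash to the same value:

# Sketch only, assuming provider.insert is importable as shown; remove_first_and_last_comment is the helper added above.
import hashlib
from provider.insert import remove_first_and_last_comment

submitted = "/****** exercise-key ******/\nint main() { return 0; }\n/****** exercise-key ******/"
cleaned = remove_first_and_last_comment(submitted)  # -> "int main() { return 0; }"
print(hashlib.md5(cleaned.encode("utf-8")).hexdigest())  # identical regardless of the injected separator value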
6 changes: 6 additions & 0 deletions radar/settings.py
@@ -102,6 +102,9 @@
("js", "JavaScript (ECMA 2016)"),
("html", "HTML5"),
("css", "CSS"),
("c", "C"),
("cpp", "C++"),
("matlab", "MATLAB"),
)
# Tokenizer functions and the separator string injected into the first line
# of each file.
@@ -123,6 +126,9 @@
    },
    "html": {"tokenize": "tokenizer.html.tokenize", "separator": "<!-- %s -->"},
    "css": {"tokenize": "tokenizer.css.tokenize", "separator": "/****** %s ******/"},
    "c": {"tokenize": "tokenizer.c.tokenize", "separator": "/****** %s ******/"},
    "cpp": {"tokenize": "tokenizer.cpp.tokenize", "separator": "/****** %s ******/"},
    "matlab": {"tokenize": "tokenizer.matlab.tokenize", "separator": "%%%%%%%%%%%% %s %%%%%%%%%%%%"},
}

PROVIDER_CHOICES = (("a+", "A+"), ("filesystem", "File system"))
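A minimal sketch of how a "%s"-style separator from TOKENIZERS expands, assuming old-style % formatting is used when the string is injected into the first line of a file (the key "cpp" and the value "submission-id" are illustrative only):

separator = "/****** %s ******/"            # TOKENIZERS["cpp"]["separator"]
first_line = separator % "submission-id"    # -> "/****** submission-id ******/"
print(first_line)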
3 changes: 2 additions & 1 deletion requirements.txt
@@ -12,6 +12,7 @@ setuptools ~= 68.1.2
django-debug-toolbar ~= 4.4.6
pytz ~= 2024.2
pylibmc ~= 1.6.3
pytest-playwright ~= 0.6.2
pytest-playwright ~= 0.6.2
pytest-django ~= 4.9.0
pytest-env ~= 1.1.5
Pygments ~= 2.19.1
9 changes: 9 additions & 0 deletions tokenizer/c.py
@@ -0,0 +1,9 @@
from pygments_lib import helper
from pygments.lexers.c_cpp import CFamilyLexer

def tokenize(source: str, config=None) -> tuple[str, list]:
    """
    Tokenizes C code by replacing all token strings with a single character.
    Returns the tokenized string and a JSON-serializable list of index mappings of the tokens to the original string.
    """
    return helper.tokenize_code(source, lexer=CFamilyLexer())
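A hedged usage sketch of the new tokenizer (the sample source and the import path tokenizer.c are assumptions about how the module is reached; the project may wire it up differently):

# Sketch: tokenize a small C snippet and inspect the result.
from tokenizer import c

tokenized, indexes = c.tokenize("int main() { return 0; }\n")
print(tokenized)  # one character per kept token (comments and whitespace are skipped)
print(indexes)    # [start, end] offsets of each kept token in the original source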
9 changes: 9 additions & 0 deletions tokenizer/cpp.py
@@ -0,0 +1,9 @@
from pygments_lib import helper
from pygments.lexers.c_cpp import CppLexer

def tokenize(source: str, config=None) -> tuple[str, list]:
    """
    Tokenizes C++ code by replacing all token strings with a single character.
    Returns the tokenized string and a JSON-serializable list of index mappings of the tokens to the original string.
    """
    return helper.tokenize_code(source, lexer=CppLexer())
9 changes: 9 additions & 0 deletions tokenizer/matlab.py
@@ -0,0 +1,9 @@
from pygments_lib import helper
from pygments.lexers.matlab import MatlabLexer

def tokenize(source: str, config=None) -> tuple[str, list]:
    """
    Tokenizes MATLAB code by replacing all token strings with a single character.
    Returns the tokenized string and a JSON-serializable list of index mappings of the tokens to the original string.
    """
    return helper.tokenize_code(source, lexer=MatlabLexer())
Empty file.
21 changes: 21 additions & 0 deletions tokenizer/pygments_lib/example.py
@@ -0,0 +1,21 @@
from pygments.lexers.c_cpp import CFamilyLexer

src = r'''#include <stdio.h>
#include <string.h>
void print_random_number() {
    printf("Random number: %d\n", rand());
}
int main() {
    print_random_number();
    return 0;
}'''

# Get tokens
lexer = CFamilyLexer()
tokens = lexer.get_tokens_unprocessed(src)

# Print tokens
for token in tokens:
    print(str(token[1]).replace('.', '_').upper(), token[2], f'[{token[0]}, {token[0] + len(token[2])}]')
67 changes: 67 additions & 0 deletions tokenizer/pygments_lib/helper.py
@@ -0,0 +1,67 @@
import logging
import tokenizer.pygments_lib.token_type as token_type
from pygments.lexer import RegexLexer

logger = logging.getLogger("radar.tokenizer")

# Token types which are dropped from the tokenized string
SKIP_TOKENS = {
    'TOKEN_COMMENT',
    'TOKEN_COMMENT_HASHBANG',
    'TOKEN_COMMENT_MULTILINE',
    'TOKEN_COMMENT_SINGLE',
    'TOKEN_COMMENT_SPECIAL',

    'TOKEN_TEXT_WHITESPACE',
}


def token_type_to_chr(token_type: int) -> str:
    """
    Returns a single character representation of the given token type.
    Starts from '!'.
    """
    return chr(token_type + 33)


def tokenize_code(source: str, lexer: RegexLexer) -> tuple[str, list]:
    """
    Tokenizes code with the given lexer by replacing each token string with a single character.
    Returns the tokenized string and a JSON-serializable list of [start, end] index mappings of the tokens to the original string.
    """

    indexes = []
    # utf-8 is the default encoding for the str type
    tokenized_source = ""

    # Get tokens from the lexer
    tokens = lexer.get_tokens_unprocessed(source)

    # Process tokens
    for token in tokens:
        # Convert token type to the name of the token type constant
        token_type_clean = str(token[1]).replace('.', '_').upper()

        # Skip tokens that do not change the semantics of the code
        if token_type_clean not in SKIP_TOKENS:
            try:
                token_type_value = token_type.__dict__[token_type_clean]

                # Check for lexer error
                if token_type_value == token_type.TOKEN_ERROR:
                    raise Exception("Lexer produced an error token")

                # Convert token type to a single character
                tokenized_source += token_type_to_chr(token_type_value)

            except KeyError:
                logger.error('Unknown token type: %s', token_type_clean)
                raise
            except Exception as e:
                logger.error('Error tokenizing source code: %s', e)
                raise

            # Save the start and end index of the token
            indexes.append([token[0], token[0] + len(token[2])])

    return tokenized_source, indexes
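As a rough illustration of the character mapping (constant values taken from token_type.py below; the chosen tokens are arbitrary examples):

# token_type_to_chr offsets the numeric constant by 33, the code of '!', so every token maps to a printable character.
print(chr(6 + 33))   # TOKEN_KEYWORD  -> "'"
print(chr(13 + 33))  # TOKEN_NAME     -> "."
print(chr(57 + 33))  # TOKEN_OPERATOR -> "Z"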
82 changes: 82 additions & 0 deletions tokenizer/pygments_lib/token_type.py
@@ -0,0 +1,82 @@
# Token type constants for the Pygments library.

TOKEN = 0
TOKEN_TEXT = 1
TOKEN_TEXT_WHITESPACE = 2
TOKEN_ESCAPE = 3
TOKEN_ERROR = 4
TOKEN_OTHER = 5
TOKEN_KEYWORD = 6
TOKEN_KEYWORD_CONSTANT = 7
TOKEN_KEYWORD_DECLARATION = 8
TOKEN_KEYWORD_NAMESPACE = 9
TOKEN_KEYWORD_PSEUDO = 10
TOKEN_KEYWORD_RESERVED = 11
TOKEN_KEYWORD_TYPE = 12
TOKEN_NAME = 13
TOKEN_NAME_ATTRIBUTE = 14
TOKEN_NAME_BUILTIN = 15
TOKEN_NAME_BUILTIN_PSEUDO = 16
TOKEN_NAME_CLASS = 17
TOKEN_NAME_CONSTANT = 18
TOKEN_NAME_DECORATOR = 19
TOKEN_NAME_ENTITY = 20
TOKEN_NAME_EXCEPTION = 21
TOKEN_NAME_FUNCTION = 22
TOKEN_NAME_FUNCTION_MAGIC = 23
TOKEN_NAME_PROPERTY = 24
TOKEN_NAME_LABEL = 25
TOKEN_NAME_NAMESPACE = 26
TOKEN_NAME_OTHER = 27
TOKEN_NAME_TAG = 28
TOKEN_NAME_VARIABLE = 29
TOKEN_NAME_VARIABLE_CLASS = 30
TOKEN_NAME_VARIABLE_GLOBAL = 31
TOKEN_NAME_VARIABLE_INSTANCE = 32
TOKEN_NAME_VARIABLE_MAGIC = 33
TOKEN_LITERAL = 34
TOKEN_LITERAL_DATE = 35
TOKEN_LITERAL_STRING = 36
TOKEN_LITERAL_STRING_AFFIX = 37
TOKEN_LITERAL_STRING_BACKTICK = 38
TOKEN_LITERAL_STRING_CHAR = 39
TOKEN_LITERAL_STRING_DELIMITER = 40
TOKEN_LITERAL_STRING_DOC = 41
TOKEN_LITERAL_STRING_DOUBLE = 42
TOKEN_LITERAL_STRING_ESCAPE = 43
TOKEN_LITERAL_STRING_HEREDOC = 44
TOKEN_LITERAL_STRING_INTERPOL = 45
TOKEN_LITERAL_STRING_OTHER = 46
TOKEN_LITERAL_STRING_REGEX = 47
TOKEN_LITERAL_STRING_SINGLE = 48
TOKEN_LITERAL_STRING_SYMBOL = 49
TOKEN_LITERAL_NUMBER = 50
TOKEN_LITERAL_NUMBER_BIN = 51
TOKEN_LITERAL_NUMBER_FLOAT = 52
TOKEN_LITERAL_NUMBER_HEX = 53
TOKEN_LITERAL_NUMBER_INTEGER = 54
TOKEN_LITERAL_NUMBER_INTEGER_LONG = 55
TOKEN_LITERAL_NUMBER_OCT = 56
TOKEN_OPERATOR = 57
TOKEN_OPERATOR_WORD = 58
TOKEN_PUNCTUATION = 59
TOKEN_PUNCTUATION_MARKER = 60
TOKEN_COMMENT = 61
TOKEN_COMMENT_HASHBANG = 62
TOKEN_COMMENT_MULTILINE = 63
TOKEN_COMMENT_PREPROC = 64
TOKEN_COMMENT_PREPROCFILE = 65
TOKEN_COMMENT_SINGLE = 66
TOKEN_COMMENT_SPECIAL = 67
TOKEN_GENERIC = 68
TOKEN_GENERIC_DELETED = 69
TOKEN_GENERIC_EMPH = 70
TOKEN_GENERIC_ERROR = 71
TOKEN_GENERIC_HEADING = 72
TOKEN_GENERIC_INSERTED = 73
TOKEN_GENERIC_OUTPUT = 74
TOKEN_GENERIC_PROMPT = 75
TOKEN_GENERIC_STRONG = 76
TOKEN_GENERIC_SUBHEADING = 77
TOKEN_GENERIC_EMPHSTRONG = 78
TOKEN_GENERIC_TRACEBACK = 79
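For reference, a short sketch of how these constant names line up with Pygments token types via the name normalization done in helper.py (the specific token Token.Keyword.Constant is just an example):

from pygments.token import Token
import tokenizer.pygments_lib.token_type as token_type

name = str(Token.Keyword.Constant).replace('.', '_').upper()  # -> "TOKEN_KEYWORD_CONSTANT"
print(name, token_type.__dict__[name])                        # -> TOKEN_KEYWORD_CONSTANT 7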
43 changes: 6 additions & 37 deletions tokenizer/scala.py
@@ -1,40 +1,9 @@
from pygments_lib import helper
from pygments.lexers.jvm import ScalaLexer


def tokenize(source: str, config=None) -> tuple[str, list]:
    """
    Tokenizes Scala code by replacing all token strings with a single character.
    Returns the tokenized string and a JSON-serializable list of index mappings of the tokens to the original string.
    """
    return helper.tokenize_code(source, lexer=ScalaLexer())
40 changes: 40 additions & 0 deletions tokenizer/scala.py_old.txt
@@ -0,0 +1,40 @@
import logging

from tokenizer.util import run

logger = logging.getLogger("radar.tokenizer")


def index_string_to_list(line):
    """
    Parses data from a line formatted: <b:Int>-<e:Int>,<b:Int>-<e:Int>,...

    @param line an input line str
    @return json serializable list of lists
    """
    return [[int(c) for c in pair.split("-")] for pair in line.split(",")]


def tokenize(source, config):
    """
    Tokenizes Scala code to a sequence of high level structural tokens
    that are independent from names or values.

    Runs a scala subprocess.

    """
    try:
        parsed = run(
            (
                "scala",
                "-cp",
                "tokenizer/scalariform/scalariform.jar",
                "ScalariformTokens",
            ),
            source,
        )
        lines = parsed.decode("utf-8").split("\n", 1)
        return lines[0].strip(), index_string_to_list(lines[1].strip())
    except Exception as e:
        logger.info("Failed to tokenize: %s", e)
        return ("", [])
