From cee7ee1eedd2cb37009db679de2711df770993e9 Mon Sep 17 00:00:00 2001
From: Harman-Aalto
Date: Mon, 27 Jan 2025 12:30:35 +0200
Subject: [PATCH] Added tokenizers: Scala3, C, C++, MATLAB

- Added the Pygments library to the requirements
- The Pygments library can be used to add more tokenizers in the future
- The checksum of submitted source code is now calculated ignoring the injected separator string in the first and last line of the file
- New tokenizers for the languages Scala3, C, C++ and MATLAB #30
- Replaces the Scala tokenizer that depended on the outdated Scalariform library #33
---
 provider/insert.py                   | 13 ++++-
 radar/settings.py                    |  6 ++
 requirements.txt                     |  3 +-
 tokenizer/c.py                       |  9 +++
 tokenizer/cpp.py                     |  9 +++
 tokenizer/matlab.py                  |  9 +++
 tokenizer/pygments_lib/__init__.py   |  0
 tokenizer/pygments_lib/example.py    | 21 +++++++
 tokenizer/pygments_lib/helper.py     | 67 +++++++++++++++++++++++
 tokenizer/pygments_lib/token_type.py | 82 ++++++++++++++++++++++++++++
 tokenizer/scala.py                   | 43 ++-------------
 tokenizer/scala.py_old.txt           | 40 ++++++++++++++
 12 files changed, 263 insertions(+), 39 deletions(-)
 create mode 100644 tokenizer/c.py
 create mode 100644 tokenizer/cpp.py
 create mode 100644 tokenizer/matlab.py
 create mode 100644 tokenizer/pygments_lib/__init__.py
 create mode 100644 tokenizer/pygments_lib/example.py
 create mode 100644 tokenizer/pygments_lib/helper.py
 create mode 100644 tokenizer/pygments_lib/token_type.py
 create mode 100644 tokenizer/scala.py_old.txt

diff --git a/provider/insert.py b/provider/insert.py
index 11381ca..5b88a2c 100644
--- a/provider/insert.py
+++ b/provider/insert.py
@@ -28,6 +28,16 @@ def insert_submission(exercise, submission_key, submitter_id, data=None):
     )
 
 
+# Remove the first and last comment from the submission text if they are separator comments injected by Radar
+def remove_first_and_last_comment(text: str) -> str:
+    lines = text.splitlines()
+    if lines[0].startswith(('######', '/******', '
diff --git a/radar/settings.py b/radar/settings.py
--- a/radar/settings.py
+++ b/radar/settings.py
     "css": {"tokenize": "tokenizer.css.tokenize", "separator": "/****** %s ******/"},
+    "c": {"tokenize": "tokenizer.c.tokenize", "separator": "/****** %s ******/"},
+    "cpp": {"tokenize": "tokenizer.cpp.tokenize", "separator": "/****** %s ******/"},
+    "matlab": {"tokenize": "tokenizer.matlab.tokenize", "separator": "%%%%%%%%%%%% %s %%%%%%%%%%%%"},
 }
 
 PROVIDER_CHOICES = (("a+", "A+"), ("filesystem", "File system"))
diff --git a/requirements.txt b/requirements.txt
index b9969da..25b8383 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -12,6 +12,7 @@ setuptools ~= 68.1.2
 django-debug-toolbar ~= 4.4.6
 pytz ~= 2024.2
 pylibmc ~= 1.6.3
-pytest-playwright ~= 0.6.2 
+pytest-playwright ~= 0.6.2
 pytest-django ~= 4.9.0
 pytest-env ~= 1.1.5
+Pygments ~= 2.19.1
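
The commit message above states that the submission checksum now ignores the separator string injected into the first and last line of the file. Purely as an illustrative sketch, not the patch's actual code: the prefix tuple, the helper name and the md5 call below are all assumptions (the prefixes are guessed from the separators registered in radar/settings.py).

    import hashlib

    # Hypothetical sketch only: the real logic lives in provider/insert.py
    # (remove_first_and_last_comment) and may differ in names and details.
    SEPARATOR_PREFIXES = ('######', '/******', '%%%%%%')  # assumed prefixes

    def checksum_ignoring_separators(text: str) -> str:
        lines = text.splitlines()
        if lines and lines[0].startswith(SEPARATOR_PREFIXES):
            lines = lines[1:]      # drop the injected first-line separator comment
        if lines and lines[-1].startswith(SEPARATOR_PREFIXES):
            lines = lines[:-1]     # drop the injected last-line separator comment
        return hashlib.md5("\n".join(lines).encode("utf-8")).hexdigest()
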
+ """ + return helper.tokenize_code(source, lexer=CFamilyLexer()) diff --git a/tokenizer/cpp.py b/tokenizer/cpp.py new file mode 100644 index 0000000..d8fced9 --- /dev/null +++ b/tokenizer/cpp.py @@ -0,0 +1,9 @@ +from pygments_lib import helper +from pygments.lexers.c_cpp import CppLexer + +def tokenize(source: str, config=None) -> tuple[str, list]: + """ + Tokenizes C++ code by replacing all token strings with a single character. + Returns the tokenized string and index mappings (as a JSON string) of the tokens to the original string. + """ + return helper.tokenize_code(source, lexer=CppLexer()) diff --git a/tokenizer/matlab.py b/tokenizer/matlab.py new file mode 100644 index 0000000..a0a58bf --- /dev/null +++ b/tokenizer/matlab.py @@ -0,0 +1,9 @@ +from pygments_lib import helper +from pygments.lexers.matlab import MatlabLexer + +def tokenize(source: str, config=None) -> tuple[str, list]: + """ + Tokenizes MATLAB code by replacing all token strings with a single character. + Returns the tokenized string and index mappings (as a JSON string) of the tokens to the original string. + """ + return helper.tokenize_code(source, lexer=MatlabLexer()) diff --git a/tokenizer/pygments_lib/__init__.py b/tokenizer/pygments_lib/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/tokenizer/pygments_lib/example.py b/tokenizer/pygments_lib/example.py new file mode 100644 index 0000000..166cb0c --- /dev/null +++ b/tokenizer/pygments_lib/example.py @@ -0,0 +1,21 @@ +from pygments.lexers.c_cpp import CFamilyLexer + +src = '''#include +#include + +void print_random_number() { + printf("Random number: %d\n", rand()); +} + +int main() { + print_random_number(); + return 0; +}''' + +#Get tokens +lexer = CFamilyLexer() +tokens = lexer.get_tokens_unprocessed(src) + +#Print tokens +for token in tokens: + print(str(token[1]).replace('.', '_').upper(), token[2], f'[{token[0], token[0] + len(token[2])}]') diff --git a/tokenizer/pygments_lib/helper.py b/tokenizer/pygments_lib/helper.py new file mode 100644 index 0000000..f396e6b --- /dev/null +++ b/tokenizer/pygments_lib/helper.py @@ -0,0 +1,67 @@ +import logging +import tokenizer.pygments_lib.token_type as token_type +from pygments.lexer import RegexLexer + +logger = logging.getLogger("radar.tokenizer") + +# Token types which are dropped from the tokenized string +SKIP_TOKENS ={ + 'TOKEN_COMMENT', + 'TOKEN_COMMENT_HASHBANG', + 'TOKEN_COMMENT_MULTILINE', + 'TOKEN_COMMENT_SINGLE', + 'TOKEN_COMMENT_SPECIAL', + + 'TOKEN_TEXT_WHITESPACE', +} + + +def token_type_to_chr(token_type: int) -> str: + """ + Returns a single character representation of the given token type. + Starts from '!'. + """ + return chr(token_type + 33) + + +def tokenize_code(source: str, lexer: RegexLexer) -> tuple[str, list]: + """ + Tokenizes code based on the lexer given by replacing all token strings with a single character. + Returns the tokenized string and index mappings (as a JSON string) of the tokens to the original string. 
+ """ + + indexes = [] + # utf-8 is default for the str-type + tokenized_source = "" + + # Get tokens from the lexer + tokens = lexer.get_tokens_unprocessed(source) + + # Process tokens + for token in tokens: + # Convert token type to the name of the token type constant + token_type_clean = str(token[1]).replace('.', '_').upper() + + # Skip tokens that do not change the semantics of the code + if token_type_clean not in SKIP_TOKENS: + try: + token_type_value = token_type.__dict__[token_type_clean] + + # Check for lexer error + if token_type_value == 4: + raise Exception("Token type is not supported") + + # Convert token type to a single character + tokenized_source += token_type_to_chr(token_type_value) + + except KeyError: + logger.error('Unknown token type: %s', token_type_clean) + raise + except Exception as e: + logger.error('Error tokenizing source code: %s', e) + raise + + # Save the start and end index of the token + indexes.append([token[0], token[0] + len(token[2])]) + + return tokenized_source, indexes diff --git a/tokenizer/pygments_lib/token_type.py b/tokenizer/pygments_lib/token_type.py new file mode 100644 index 0000000..5c5af04 --- /dev/null +++ b/tokenizer/pygments_lib/token_type.py @@ -0,0 +1,82 @@ +# Token types constants for the Pygments library. + +TOKEN = 0 +TOKEN_TEXT = 1 +TOKEN_TEXT_WHITESPACE = 2 +TOKEN_ESCAPE = 3 +TOKEN_ERROR = 4 +TOKEN_OTHER = 5 +TOKEN_KEYWORD = 6 +TOKEN_KEYWORD_CONSTANT = 7 +TOKEN_KEYWORD_DECLARATION = 8 +TOKEN_KEYWORD_NAMESPACE = 9 +TOKEN_KEYWORD_PSEUDO = 10 +TOKEN_KEYWORD_RESERVED = 11 +TOKEN_KEYWORD_TYPE = 12 +TOKEN_NAME = 13 +TOKEN_NAME_ATTRIBUTE = 14 +TOKEN_NAME_BUILTIN = 15 +TOKEN_NAME_BUILTIN_PSEUDO = 16 +TOKEN_NAME_CLASS = 17 +TOKEN_NAME_CONSTANT = 18 +TOKEN_NAME_DECORATOR = 19 +TOKEN_NAME_ENTITY = 20 +TOKEN_NAME_EXCEPTION = 21 +TOKEN_NAME_FUNCTION = 22 +TOKEN_NAME_FUNCTION_MAGIC = 23 +TOKEN_NAME_PROPERTY = 24 +TOKEN_NAME_LABEL = 25 +TOKEN_NAME_NAMESPACE = 26 +TOKEN_NAME_OTHER = 27 +TOKEN_NAME_TAG = 28 +TOKEN_NAME_VARIABLE = 29 +TOKEN_NAME_VARIABLE_CLASS = 30 +TOKEN_NAME_VARIABLE_GLOBAL = 31 +TOKEN_NAME_VARIABLE_INSTANCE = 32 +TOKEN_NAME_VARIABLE_MAGIC = 33 +TOKEN_LITERAL = 34 +TOKEN_LITERAL_DATE = 35 +TOKEN_LITERAL_STRING = 36 +TOKEN_LITERAL_STRING_AFFIX = 37 +TOKEN_LITERAL_STRING_BACKTICK = 38 +TOKEN_LITERAL_STRING_CHAR = 39 +TOKEN_LITERAL_STRING_DELIMITER = 40 +TOKEN_LITERAL_STRING_DOC = 41 +TOKEN_LITERAL_STRING_DOUBLE = 42 +TOKEN_LITERAL_STRING_ESCAPE = 43 +TOKEN_LITERAL_STRING_HEREDOC = 44 +TOKEN_LITERAL_STRING_INTERPOL = 45 +TOKEN_LITERAL_STRING_OTHER = 46 +TOKEN_LITERAL_STRING_REGEX = 47 +TOKEN_LITERAL_STRING_SINGLE = 48 +TOKEN_LITERAL_STRING_SYMBOL = 49 +TOKEN_LITERAL_NUMBER = 50 +TOKEN_LITERAL_NUMBER_BIN = 51 +TOKEN_LITERAL_NUMBER_FLOAT = 52 +TOKEN_LITERAL_NUMBER_HEX = 53 +TOKEN_LITERAL_NUMBER_INTEGER = 54 +TOKEN_LITERAL_NUMBER_INTEGER_LONG = 55 +TOKEN_LITERAL_NUMBER_OCT = 56 +TOKEN_OPERATOR = 57 +TOKEN_OPERATOR_WORD = 58 +TOKEN_PUNCTUATION = 59 +TOKEN_PUNCTUATION_MARKER = 60 +TOKEN_COMMENT = 61 +TOKEN_COMMENT_HASHBANG = 62 +TOKEN_COMMENT_MULTILINE = 63 +TOKEN_COMMENT_PREPROC = 64 +TOKEN_COMMENT_PREPROCFILE = 65 +TOKEN_COMMENT_SINGLE = 66 +TOKEN_COMMENT_SPECIAL = 67 +TOKEN_GENERIC = 68 +TOKEN_GENERIC_DELETED = 69 +TOKEN_GENERIC_EMPH = 70 +TOKEN_GENERIC_ERROR = 71 +TOKEN_GENERIC_HEADING = 72 +TOKEN_GENERIC_INSERTED = 73 +TOKEN_GENERIC_OUTPUT = 74 +TOKEN_GENERIC_PROMPT = 75 +TOKEN_GENERIC_STRONG = 76 +TOKEN_GENERIC_SUBHEADING = 77 +TOKEN_GENERIC_EMPHSTRONG = 78 +TOKEN_GENERIC_TRACEBACK = 79 diff --git a/tokenizer/scala.py 
diff --git a/tokenizer/scala.py b/tokenizer/scala.py
index 5580886..1828772 100644
--- a/tokenizer/scala.py
+++ b/tokenizer/scala.py
@@ -1,40 +1,9 @@
-import logging
+from tokenizer.pygments_lib import helper
+from pygments.lexers.jvm import ScalaLexer
 
-from tokenizer.util import run
-
-logger = logging.getLogger("radar.tokenizer")
-
-
-def index_string_to_list(line):
-    """
-    Parses data from a line formatted: <start>-<end>,<start>-<end>,...
-
-    @param line an input line str
-    @return json serializable list of lists
+def tokenize(source: str, config=None) -> tuple[str, list]:
+    """
-    """
-    return [[int(c) for c in pair.split("-")] for pair in line.split(",")]
-
-
-def tokenize(source, config):
-    """
-    Tokenizes Scala code to a sequence of high level structural tokens
-    that are independent from names or values.
-
-    Runs a scala subprocess.
-
+    Tokenizes Scala code by replacing all token strings with a single character.
+    Returns the tokenized string and a list of index mappings from the tokens to the original string.
     """
-    try:
-        parsed = run(
-            (
-                "scala",
-                "-cp",
-                "tokenizer/scalariform/scalariform.jar",
-                "ScalariformTokens",
-            ),
-            source,
-        )
-        lines = parsed.decode("utf-8").split("\n", 1)
-        return lines[0].strip(), index_string_to_list(lines[1].strip())
-    except Exception as e:
-        logger.info("Failed to tokenize: %s", e)
-        return ("", [])
+    return helper.tokenize_code(source, lexer=ScalaLexer())
diff --git a/tokenizer/scala.py_old.txt b/tokenizer/scala.py_old.txt
new file mode 100644
index 0000000..5580886
--- /dev/null
+++ b/tokenizer/scala.py_old.txt
@@ -0,0 +1,40 @@
+import logging
+
+from tokenizer.util import run
+
+logger = logging.getLogger("radar.tokenizer")
+
+
+def index_string_to_list(line):
+    """
+    Parses data from a line formatted: <start>-<end>,<start>-<end>,...
+
+    @param line an input line str
+    @return json serializable list of lists
+    """
+    return [[int(c) for c in pair.split("-")] for pair in line.split(",")]
+
+
+def tokenize(source, config):
+    """
+    Tokenizes Scala code to a sequence of high level structural tokens
+    that are independent from names or values.
+
+    Runs a scala subprocess.
+
+    """
+    try:
+        parsed = run(
+            (
+                "scala",
+                "-cp",
+                "tokenizer/scalariform/scalariform.jar",
+                "ScalariformTokens",
+            ),
+            source,
+        )
+        lines = parsed.decode("utf-8").split("\n", 1)
+        return lines[0].strip(), index_string_to_list(lines[1].strip())
+    except Exception as e:
+        logger.info("Failed to tokenize: %s", e)
+        return ("", [])
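
With this change the Scala tokenizer runs entirely in-process through Pygments' ScalaLexer instead of spawning a scala subprocess with the Scalariform jar. A small smoke-test sketch, assuming the project root is importable; the sample snippet is illustrative only:

    from tokenizer.scala import tokenize

    scala_source = """object Hello {
      def main(args: Array[String]): Unit =
        println("Hello")
    }
    """

    tokenized, indexes = tokenize(scala_source)

    # Comments and whitespace are dropped by the helper, so the tokenized string is a
    # compact structural fingerprint; indexes[i] gives the source span of tokenized[i].
    print(len(tokenized), len(indexes))
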