Added tokenizers: Scala3, C, C++, MATLAB
- Added the Pygments library to requirements
- The Pygments library can be used to add more tokenizers in the future
- The checksum of submitted source code is now calculated by ignoring the separator string injected into the first and last line of the file
- New tokenizers for languages: Scala3, C, C++, MATLAB #30
- The previous Scala tokenizer depended on the outdated Scalariform library #33
Harman-Aalto committed Jan 27, 2025
1 parent 9e56f8c commit cee7ee1
Showing 12 changed files with 263 additions and 39 deletions.
13 changes: 12 additions & 1 deletion provider/insert.py
@@ -28,6 +28,16 @@ def insert_submission(exercise, submission_key, submitter_id, data=None):
)


# Remove the separator comment injected by Radar from the first and last line of the submission text, if present
def remove_first_and_last_comment(text: str) -> str:
    lines = text.splitlines()
    if lines and lines[0].startswith(('######', '/******', '<!--', '%%%%%%%%%%%%')):
        lines = lines[1:]
    if lines and lines[-1].startswith(('######', '/******', '<!--', '%%%%%%%%%%%%')):
        lines = lines[:-1]
    return '\n'.join(lines)


def prepare_submission(submission, matching_start_time=''):

    if matching_start_time:
@@ -64,7 +74,8 @@ def prepare_submission(submission, matching_start_time=''):
    # Compute checksum of submitted source code for finding exact character matches quickly
    # This line will not be reached if submission_text contains data not encodable in utf-8,
    # since it is checked in tokenizer.tokenize_submission
    submission_hash = hashlib.md5(submission_text.encode("utf-8"))
    submission_text_without_separators = remove_first_and_last_comment(submission_text)
    submission_hash = hashlib.md5(submission_text_without_separators.encode("utf-8"))
    submission.source_checksum = submission_hash.hexdigest()
    submission.save()

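As a rough illustration of the checksum change above (the separator value "exercise-key" is a made-up placeholder, not something this commit defines), stripping the injected first and last comment lines means submissions that differ only in the separator hash to the same value:

# Sketch only, assuming provider.insert is importable as shown; remove_first_and_last_comment is the helper added above.
import hashlib
from provider.insert import remove_first_and_last_comment

submitted = "/****** exercise-key ******/\nint main() { return 0; }\n/****** exercise-key ******/"
cleaned = remove_first_and_last_comment(submitted)  # -> "int main() { return 0; }"
print(hashlib.md5(cleaned.encode("utf-8")).hexdigest())  # identical regardless of the injected separator value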
6 changes: 6 additions & 0 deletions radar/settings.py
@@ -102,6 +102,9 @@
("js", "JavaScript (ECMA 2016)"),
("html", "HTML5"),
("css", "CSS"),
("c", "C"),
("cpp", "C++"),
("matlab", "MATLAB"),
)
# Tokenizer functions and the separator string injected into the first line
# of each file.
@@ -123,6 +126,9 @@
    },
    "html": {"tokenize": "tokenizer.html.tokenize", "separator": "<!-- %s -->"},
    "css": {"tokenize": "tokenizer.css.tokenize", "separator": "/****** %s ******/"},
    "c": {"tokenize": "tokenizer.c.tokenize", "separator": "/****** %s ******/"},
    "cpp": {"tokenize": "tokenizer.cpp.tokenize", "separator": "/****** %s ******/"},
    "matlab": {"tokenize": "tokenizer.matlab.tokenize", "separator": "%%%%%%%%%%%% %s %%%%%%%%%%%%"},
}

PROVIDER_CHOICES = (("a+", "A+"), ("filesystem", "File system"))
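A minimal sketch of how a "%s"-style separator from TOKENIZERS expands, assuming old-style % formatting is used when the string is injected into the first line of a file (the key "cpp" and the value "submission-id" are illustrative only):

separator = "/****** %s ******/"            # TOKENIZERS["cpp"]["separator"]
first_line = separator % "submission-id"    # -> "/****** submission-id ******/"
print(first_line)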
3 changes: 2 additions & 1 deletion requirements.txt
@@ -12,6 +12,7 @@ setuptools ~= 68.1.2
django-debug-toolbar ~= 4.4.6
pytz ~= 2024.2
pylibmc ~= 1.6.3
pytest-playwright ~= 0.6.2
pytest-playwright ~= 0.6.2
pytest-django ~= 4.9.0
pytest-env ~= 1.1.5
Pygments ~= 2.19.1
9 changes: 9 additions & 0 deletions tokenizer/c.py
@@ -0,0 +1,9 @@
from pygments_lib import helper
from pygments.lexers.c_cpp import CFamilyLexer

def tokenize(source: str, config=None) -> tuple[str, list]:
    """
    Tokenizes C code by replacing all token strings with a single character.
    Returns the tokenized string and a JSON-serializable list of index mappings of the tokens to the original string.
    """
    return helper.tokenize_code(source, lexer=CFamilyLexer())
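A hedged usage sketch of the new tokenizer (the sample source and the import path tokenizer.c are assumptions about how the module is reached; the project may wire it up differently):

# Sketch: tokenize a small C snippet and inspect the result.
from tokenizer import c

tokenized, indexes = c.tokenize("int main() { return 0; }\n")
print(tokenized)  # one character per kept token (comments and whitespace are skipped)
print(indexes)    # [start, end] offsets of each kept token in the original source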
9 changes: 9 additions & 0 deletions tokenizer/cpp.py
@@ -0,0 +1,9 @@
from pygments_lib import helper
from pygments.lexers.c_cpp import CppLexer

def tokenize(source: str, config=None) -> tuple[str, list]:
    """
    Tokenizes C++ code by replacing all token strings with a single character.
    Returns the tokenized string and a JSON-serializable list of index mappings of the tokens to the original string.
    """
    return helper.tokenize_code(source, lexer=CppLexer())
9 changes: 9 additions & 0 deletions tokenizer/matlab.py
@@ -0,0 +1,9 @@
from pygments_lib import helper
from pygments.lexers.matlab import MatlabLexer

def tokenize(source: str, config=None) -> tuple[str, list]:
    """
    Tokenizes MATLAB code by replacing all token strings with a single character.
    Returns the tokenized string and a JSON-serializable list of index mappings of the tokens to the original string.
    """
    return helper.tokenize_code(source, lexer=MatlabLexer())
Empty file.
21 changes: 21 additions & 0 deletions tokenizer/pygments_lib/example.py
@@ -0,0 +1,21 @@
from pygments.lexers.c_cpp import CFamilyLexer

src = r'''#include <stdio.h>
#include <string.h>
void print_random_number() {
    printf("Random number: %d\n", rand());
}
int main() {
    print_random_number();
    return 0;
}'''

# Get tokens
lexer = CFamilyLexer()
tokens = lexer.get_tokens_unprocessed(src)

# Print tokens
for token in tokens:
    print(str(token[1]).replace('.', '_').upper(), token[2], f'[{token[0]}, {token[0] + len(token[2])}]')
67 changes: 67 additions & 0 deletions tokenizer/pygments_lib/helper.py
@@ -0,0 +1,67 @@
import logging
import tokenizer.pygments_lib.token_type as token_type
from pygments.lexer import RegexLexer

logger = logging.getLogger("radar.tokenizer")

# Token types which are dropped from the tokenized string
SKIP_TOKENS = {
    'TOKEN_COMMENT',
    'TOKEN_COMMENT_HASHBANG',
    'TOKEN_COMMENT_MULTILINE',
    'TOKEN_COMMENT_SINGLE',
    'TOKEN_COMMENT_SPECIAL',

    'TOKEN_TEXT_WHITESPACE',
}


def token_type_to_chr(token_type: int) -> str:
    """
    Returns a single character representation of the given token type.
    Starts from '!'.
    """
    return chr(token_type + 33)


def tokenize_code(source: str, lexer: RegexLexer) -> tuple[str, list]:
    """
    Tokenizes code with the given lexer by replacing each token string with a single character.
    Returns the tokenized string and a JSON-serializable list of [start, end] index mappings of the tokens to the original string.
    """

    indexes = []
    # utf-8 is the default encoding for the str type
    tokenized_source = ""

    # Get tokens from the lexer
    tokens = lexer.get_tokens_unprocessed(source)

    # Process tokens
    for token in tokens:
        # Convert token type to the name of the token type constant
        token_type_clean = str(token[1]).replace('.', '_').upper()

        # Skip tokens that do not change the semantics of the code
        if token_type_clean not in SKIP_TOKENS:
            try:
                token_type_value = token_type.__dict__[token_type_clean]

                # Check for lexer error
                if token_type_value == token_type.TOKEN_ERROR:
                    raise Exception("Lexer produced an error token")

                # Convert token type to a single character
                tokenized_source += token_type_to_chr(token_type_value)

            except KeyError:
                logger.error('Unknown token type: %s', token_type_clean)
                raise
            except Exception as e:
                logger.error('Error tokenizing source code: %s', e)
                raise

            # Save the start and end index of the token
            indexes.append([token[0], token[0] + len(token[2])])

    return tokenized_source, indexes
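As a rough illustration of the character mapping (constant values taken from token_type.py below; the chosen tokens are arbitrary examples):

# token_type_to_chr offsets the numeric constant by 33, the code of '!', so every token maps to a printable character.
print(chr(6 + 33))   # TOKEN_KEYWORD  -> "'"
print(chr(13 + 33))  # TOKEN_NAME     -> "."
print(chr(57 + 33))  # TOKEN_OPERATOR -> "Z"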
82 changes: 82 additions & 0 deletions tokenizer/pygments_lib/token_type.py
@@ -0,0 +1,82 @@
# Token type constants for the Pygments library.

TOKEN = 0
TOKEN_TEXT = 1
TOKEN_TEXT_WHITESPACE = 2
TOKEN_ESCAPE = 3
TOKEN_ERROR = 4
TOKEN_OTHER = 5
TOKEN_KEYWORD = 6
TOKEN_KEYWORD_CONSTANT = 7
TOKEN_KEYWORD_DECLARATION = 8
TOKEN_KEYWORD_NAMESPACE = 9
TOKEN_KEYWORD_PSEUDO = 10
TOKEN_KEYWORD_RESERVED = 11
TOKEN_KEYWORD_TYPE = 12
TOKEN_NAME = 13
TOKEN_NAME_ATTRIBUTE = 14
TOKEN_NAME_BUILTIN = 15
TOKEN_NAME_BUILTIN_PSEUDO = 16
TOKEN_NAME_CLASS = 17
TOKEN_NAME_CONSTANT = 18
TOKEN_NAME_DECORATOR = 19
TOKEN_NAME_ENTITY = 20
TOKEN_NAME_EXCEPTION = 21
TOKEN_NAME_FUNCTION = 22
TOKEN_NAME_FUNCTION_MAGIC = 23
TOKEN_NAME_PROPERTY = 24
TOKEN_NAME_LABEL = 25
TOKEN_NAME_NAMESPACE = 26
TOKEN_NAME_OTHER = 27
TOKEN_NAME_TAG = 28
TOKEN_NAME_VARIABLE = 29
TOKEN_NAME_VARIABLE_CLASS = 30
TOKEN_NAME_VARIABLE_GLOBAL = 31
TOKEN_NAME_VARIABLE_INSTANCE = 32
TOKEN_NAME_VARIABLE_MAGIC = 33
TOKEN_LITERAL = 34
TOKEN_LITERAL_DATE = 35
TOKEN_LITERAL_STRING = 36
TOKEN_LITERAL_STRING_AFFIX = 37
TOKEN_LITERAL_STRING_BACKTICK = 38
TOKEN_LITERAL_STRING_CHAR = 39
TOKEN_LITERAL_STRING_DELIMITER = 40
TOKEN_LITERAL_STRING_DOC = 41
TOKEN_LITERAL_STRING_DOUBLE = 42
TOKEN_LITERAL_STRING_ESCAPE = 43
TOKEN_LITERAL_STRING_HEREDOC = 44
TOKEN_LITERAL_STRING_INTERPOL = 45
TOKEN_LITERAL_STRING_OTHER = 46
TOKEN_LITERAL_STRING_REGEX = 47
TOKEN_LITERAL_STRING_SINGLE = 48
TOKEN_LITERAL_STRING_SYMBOL = 49
TOKEN_LITERAL_NUMBER = 50
TOKEN_LITERAL_NUMBER_BIN = 51
TOKEN_LITERAL_NUMBER_FLOAT = 52
TOKEN_LITERAL_NUMBER_HEX = 53
TOKEN_LITERAL_NUMBER_INTEGER = 54
TOKEN_LITERAL_NUMBER_INTEGER_LONG = 55
TOKEN_LITERAL_NUMBER_OCT = 56
TOKEN_OPERATOR = 57
TOKEN_OPERATOR_WORD = 58
TOKEN_PUNCTUATION = 59
TOKEN_PUNCTUATION_MARKER = 60
TOKEN_COMMENT = 61
TOKEN_COMMENT_HASHBANG = 62
TOKEN_COMMENT_MULTILINE = 63
TOKEN_COMMENT_PREPROC = 64
TOKEN_COMMENT_PREPROCFILE = 65
TOKEN_COMMENT_SINGLE = 66
TOKEN_COMMENT_SPECIAL = 67
TOKEN_GENERIC = 68
TOKEN_GENERIC_DELETED = 69
TOKEN_GENERIC_EMPH = 70
TOKEN_GENERIC_ERROR = 71
TOKEN_GENERIC_HEADING = 72
TOKEN_GENERIC_INSERTED = 73
TOKEN_GENERIC_OUTPUT = 74
TOKEN_GENERIC_PROMPT = 75
TOKEN_GENERIC_STRONG = 76
TOKEN_GENERIC_SUBHEADING = 77
TOKEN_GENERIC_EMPHSTRONG = 78
TOKEN_GENERIC_TRACEBACK = 79
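For reference, a short sketch of how these constant names line up with Pygments token types via the name normalization done in helper.py (the specific token Token.Keyword.Constant is just an example):

from pygments.token import Token
import tokenizer.pygments_lib.token_type as token_type

name = str(Token.Keyword.Constant).replace('.', '_').upper()  # -> "TOKEN_KEYWORD_CONSTANT"
print(name, token_type.__dict__[name])                        # -> TOKEN_KEYWORD_CONSTANT 7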
43 changes: 6 additions & 37 deletions tokenizer/scala.py
@@ -1,40 +1,9 @@
from pygments_lib import helper
from pygments.lexers.jvm import ScalaLexer


def tokenize(source: str, config=None) -> tuple[str, list]:
    """
    Tokenizes Scala code by replacing all token strings with a single character.
    Returns the tokenized string and a JSON-serializable list of index mappings of the tokens to the original string.
    """
    return helper.tokenize_code(source, lexer=ScalaLexer())
40 changes: 40 additions & 0 deletions tokenizer/scala.py_old.txt
@@ -0,0 +1,40 @@
import logging

from tokenizer.util import run

logger = logging.getLogger("radar.tokenizer")


def index_string_to_list(line):
    """
    Parses data from a line formatted: <b:Int>-<e:Int>,<b:Int>-<e:Int>,...

    @param line an input line str
    @return json serializable list of lists
    """
    return [[int(c) for c in pair.split("-")] for pair in line.split(",")]


def tokenize(source, config):
    """
    Tokenizes Scala code to a sequence of high level structural tokens
    that are independent from names or values.

    Runs a scala subprocess.

    """
    try:
        parsed = run(
            (
                "scala",
                "-cp",
                "tokenizer/scalariform/scalariform.jar",
                "ScalariformTokens",
            ),
            source,
        )
        lines = parsed.decode("utf-8").split("\n", 1)
        return lines[0].strip(), index_string_to_list(lines[1].strip())
    except Exception as e:
        logger.info("Failed to tokenize: %s", e)
        return ("", [])
