Added tokenizers: Scala3, C, C++, MATLAB
- Added Pygments library to requirements
- The Pygments library can be used to add even more tokenizers in the future
- The checksum of submitted source code is now calculated by ignoring the injected separator string in the first and last line of the file
- New tokenizers for languages: Scala3, C, C++, MATLAB #30
- The Scala tokenizer depends on the outdated library Scalariform #33
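The checksum change means the separator Radar injects around a submission no longer affects duplicate detection. A minimal sketch of the idea, assuming a hypothetical SEPARATOR constant and an MD5 digest (the real separator string and hash function live elsewhere in Radar's code, not in this diff):

import hashlib

SEPARATOR = "#!RADAR_SEPARATOR"  # hypothetical stand-in for Radar's injected separator string

def source_checksum(source: str) -> str:
    """Checksum of the source, ignoring an injected separator in the first and last line."""
    lines = source.split("\n")
    if lines and SEPARATOR in lines[0]:
        lines = lines[1:]
    if lines and SEPARATOR in lines[-1]:
        lines = lines[:-1]
    return hashlib.md5("\n".join(lines).encode("utf-8")).hexdigest()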
1 parent 9e56f8c · commit cee7ee1
Showing 12 changed files with 263 additions and 39 deletions.
@@ -0,0 +1,9 @@
from pygments_lib import helper
from pygments.lexers.c_cpp import CFamilyLexer


def tokenize(source: str, config=None) -> tuple[str, list]:
    """
    Tokenizes C code by replacing each token string with a single character.
    Returns the tokenized string and index mappings (a list of [start, end] pairs) of the tokens to the original string.
    """
    return helper.tokenize_code(source, lexer=CFamilyLexer())
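For illustration, the new tokenizer could be exercised like this (a sketch only: the import path tokenizer.c is a guess, since file names are not shown in this view):

# Hypothetical import path -- the actual module name is not visible on this page.
from tokenizer.c import tokenize

tokenized, indexes = tokenize("int main() { return 0; }")
# tokenized: one printable character per significant token
# indexes:   [start, end] offsets of each kept token in the original source
print(tokenized, indexes)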
@@ -0,0 +1,9 @@
from pygments_lib import helper
from pygments.lexers.c_cpp import CppLexer


def tokenize(source: str, config=None) -> tuple[str, list]:
    """
    Tokenizes C++ code by replacing each token string with a single character.
    Returns the tokenized string and index mappings (a list of [start, end] pairs) of the tokens to the original string.
    """
    return helper.tokenize_code(source, lexer=CppLexer())
@@ -0,0 +1,9 @@
from pygments_lib import helper
from pygments.lexers.matlab import MatlabLexer


def tokenize(source: str, config=None) -> tuple[str, list]:
    """
    Tokenizes MATLAB code by replacing each token string with a single character.
    Returns the tokenized string and index mappings (a list of [start, end] pairs) of the tokens to the original string.
    """
    return helper.tokenize_code(source, lexer=MatlabLexer())
Empty file.
@@ -0,0 +1,21 @@
from pygments.lexers.c_cpp import CFamilyLexer

src = '''#include <stdio.h>
#include <string.h>

void print_random_number() {
    printf("Random number: %d\n", rand());
}

int main() {
    print_random_number();
    return 0;
}'''

# Get tokens
lexer = CFamilyLexer()
tokens = lexer.get_tokens_unprocessed(src)

# Print tokens
for token in tokens:
    print(str(token[1]).replace('.', '_').upper(), token[2], f'[{token[0]}, {token[0] + len(token[2])}]')
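When run, this script prints one line per token in the form <TOKEN_TYPE_NAME> <token text> [start, end], where start and end are character offsets into src; the exact token names and boundaries depend on the installed Pygments version.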
@@ -0,0 +1,67 @@
import logging
import tokenizer.pygments_lib.token_type as token_type
from pygments.lexer import RegexLexer

logger = logging.getLogger("radar.tokenizer")

# Token types which are dropped from the tokenized string
SKIP_TOKENS = {
    'TOKEN_COMMENT',
    'TOKEN_COMMENT_HASHBANG',
    'TOKEN_COMMENT_MULTILINE',
    'TOKEN_COMMENT_SINGLE',
    'TOKEN_COMMENT_SPECIAL',

    'TOKEN_TEXT_WHITESPACE',
}


def token_type_to_chr(token_type: int) -> str:
    """
    Returns a single-character representation of the given token type.
    Starts from '!' (ASCII 33).
    """
    return chr(token_type + 33)


def tokenize_code(source: str, lexer: RegexLexer) -> tuple[str, list]:
    """
    Tokenizes code with the given lexer by replacing each token string with a single character.
    Returns the tokenized string and index mappings (a list of [start, end] pairs) of the tokens to the original string.
    """
    indexes = []
    # utf-8 is the default encoding for the str type
    tokenized_source = ""

    # Get tokens from the lexer
    tokens = lexer.get_tokens_unprocessed(source)

    # Process tokens
    for token in tokens:
        # Convert the token type to the name of the token type constant
        token_type_clean = str(token[1]).replace('.', '_').upper()

        # Skip tokens that do not change the semantics of the code
        if token_type_clean not in SKIP_TOKENS:
            try:
                token_type_value = token_type.__dict__[token_type_clean]

                # Check for lexer error (4 == TOKEN_ERROR)
                if token_type_value == 4:
                    raise Exception("Token type is not supported")

                # Convert the token type to a single character
                tokenized_source += token_type_to_chr(token_type_value)
            except KeyError:
                logger.error('Unknown token type: %s', token_type_clean)
                raise
            except Exception as e:
                logger.error('Error tokenizing source code: %s', e)
                raise

            # Save the start and end index of the token
            indexes.append([token[0], token[0] + len(token[2])])

    return tokenized_source, indexes
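As a usage sketch (illustrative; the import path is inferred from the token_type import in this file), the helper works with any Pygments lexer:

from pygments.lexers.c_cpp import CFamilyLexer
from tokenizer.pygments_lib import helper

tokenized, indexes = helper.tokenize_code("int x = 1;", lexer=CFamilyLexer())
# One character per kept token, and one [start, end] pair per character:
assert len(tokenized) == len(indexes)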
@@ -0,0 +1,82 @@
# Token type constants for the Pygments library.

TOKEN = 0
TOKEN_TEXT = 1
TOKEN_TEXT_WHITESPACE = 2
TOKEN_ESCAPE = 3
TOKEN_ERROR = 4
TOKEN_OTHER = 5
TOKEN_KEYWORD = 6
TOKEN_KEYWORD_CONSTANT = 7
TOKEN_KEYWORD_DECLARATION = 8
TOKEN_KEYWORD_NAMESPACE = 9
TOKEN_KEYWORD_PSEUDO = 10
TOKEN_KEYWORD_RESERVED = 11
TOKEN_KEYWORD_TYPE = 12
TOKEN_NAME = 13
TOKEN_NAME_ATTRIBUTE = 14
TOKEN_NAME_BUILTIN = 15
TOKEN_NAME_BUILTIN_PSEUDO = 16
TOKEN_NAME_CLASS = 17
TOKEN_NAME_CONSTANT = 18
TOKEN_NAME_DECORATOR = 19
TOKEN_NAME_ENTITY = 20
TOKEN_NAME_EXCEPTION = 21
TOKEN_NAME_FUNCTION = 22
TOKEN_NAME_FUNCTION_MAGIC = 23
TOKEN_NAME_PROPERTY = 24
TOKEN_NAME_LABEL = 25
TOKEN_NAME_NAMESPACE = 26
TOKEN_NAME_OTHER = 27
TOKEN_NAME_TAG = 28
TOKEN_NAME_VARIABLE = 29
TOKEN_NAME_VARIABLE_CLASS = 30
TOKEN_NAME_VARIABLE_GLOBAL = 31
TOKEN_NAME_VARIABLE_INSTANCE = 32
TOKEN_NAME_VARIABLE_MAGIC = 33
TOKEN_LITERAL = 34
TOKEN_LITERAL_DATE = 35
TOKEN_LITERAL_STRING = 36
TOKEN_LITERAL_STRING_AFFIX = 37
TOKEN_LITERAL_STRING_BACKTICK = 38
TOKEN_LITERAL_STRING_CHAR = 39
TOKEN_LITERAL_STRING_DELIMITER = 40
TOKEN_LITERAL_STRING_DOC = 41
TOKEN_LITERAL_STRING_DOUBLE = 42
TOKEN_LITERAL_STRING_ESCAPE = 43
TOKEN_LITERAL_STRING_HEREDOC = 44
TOKEN_LITERAL_STRING_INTERPOL = 45
TOKEN_LITERAL_STRING_OTHER = 46
TOKEN_LITERAL_STRING_REGEX = 47
TOKEN_LITERAL_STRING_SINGLE = 48
TOKEN_LITERAL_STRING_SYMBOL = 49
TOKEN_LITERAL_NUMBER = 50
TOKEN_LITERAL_NUMBER_BIN = 51
TOKEN_LITERAL_NUMBER_FLOAT = 52
TOKEN_LITERAL_NUMBER_HEX = 53
TOKEN_LITERAL_NUMBER_INTEGER = 54
TOKEN_LITERAL_NUMBER_INTEGER_LONG = 55
TOKEN_LITERAL_NUMBER_OCT = 56
TOKEN_OPERATOR = 57
TOKEN_OPERATOR_WORD = 58
TOKEN_PUNCTUATION = 59
TOKEN_PUNCTUATION_MARKER = 60
TOKEN_COMMENT = 61
TOKEN_COMMENT_HASHBANG = 62
TOKEN_COMMENT_MULTILINE = 63
TOKEN_COMMENT_PREPROC = 64
TOKEN_COMMENT_PREPROCFILE = 65
TOKEN_COMMENT_SINGLE = 66
TOKEN_COMMENT_SPECIAL = 67
TOKEN_GENERIC = 68
TOKEN_GENERIC_DELETED = 69
TOKEN_GENERIC_EMPH = 70
TOKEN_GENERIC_ERROR = 71
TOKEN_GENERIC_HEADING = 72
TOKEN_GENERIC_INSERTED = 73
TOKEN_GENERIC_OUTPUT = 74
TOKEN_GENERIC_PROMPT = 75
TOKEN_GENERIC_STRONG = 76
TOKEN_GENERIC_SUBHEADING = 77
TOKEN_GENERIC_EMPHSTRONG = 78
TOKEN_GENERIC_TRACEBACK = 79
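These constants feed token_type_to_chr in the helper above, which shifts each value by 33 so that every token type maps to a printable ASCII character. A quick check, using values from this table:

# chr(value + 33): printable ASCII starts at '!' (33)
chr(TOKEN_KEYWORD + 33)   # 6  + 33 = 39 -> "'"
chr(TOKEN_NAME + 33)      # 13 + 33 = 46 -> "."
chr(TOKEN_OPERATOR + 33)  # 57 + 33 = 90 -> "Z"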
@@ -1,40 +1,9 @@
-import logging
-
-from tokenizer.util import run
-
-logger = logging.getLogger("radar.tokenizer")
-
-
-def index_string_to_list(line):
-    """
-    Parses data from a line formatted: <b:Int>-<e:Int>,<b:Int>-<e:Int>,...
-
-    @param line an input line str
-    @return json serializable list of lists
-    """
-    return [[int(c) for c in pair.split("-")] for pair in line.split(",")]
-
-
-def tokenize(source, config):
-    """
-    Tokenizes Scala code to a sequence of high level structural tokens
-    that are independent from names or values.
-
-    Runs a scala subprocess.
-    """
-    try:
-        parsed = run(
-            (
-                "scala",
-                "-cp",
-                "tokenizer/scalariform/scalariform.jar",
-                "ScalariformTokens",
-            ),
-            source,
-        )
-        lines = parsed.decode("utf-8").split("\n", 1)
-        return lines[0].strip(), index_string_to_list(lines[1].strip())
-    except Exception as e:
-        logger.info("Failed to tokenize: %s", e)
-        return ("", [])
+from pygments_lib import helper
+from pygments.lexers.jvm import ScalaLexer
+
+def tokenize(source: str, config=None) -> tuple[str, list]:
+    """
+    Tokenizes Scala code by replacing each token string with a single character.
+    Returns the tokenized string and index mappings (a list of [start, end] pairs) of the tokens to the original string.
+    """
+    return helper.tokenize_code(source, lexer=ScalaLexer())
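With this change the Scala tokenizer uses Pygments' ScalaLexer instead of shelling out to a Scala process with the outdated Scalariform jar (#33); the previous subprocess implementation is preserved as a new file below.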
@@ -0,0 +1,40 @@
import logging

from tokenizer.util import run

logger = logging.getLogger("radar.tokenizer")


def index_string_to_list(line):
    """
    Parses data from a line formatted: <b:Int>-<e:Int>,<b:Int>-<e:Int>,...

    @param line an input line str
    @return json serializable list of lists
    """
    return [[int(c) for c in pair.split("-")] for pair in line.split(",")]


def tokenize(source, config):
    """
    Tokenizes Scala code to a sequence of high level structural tokens
    that are independent from names or values.

    Runs a scala subprocess.
    """
    try:
        parsed = run(
            (
                "scala",
                "-cp",
                "tokenizer/scalariform/scalariform.jar",
                "ScalariformTokens",
            ),
            source,
        )
        lines = parsed.decode("utf-8").split("\n", 1)
        return lines[0].strip(), index_string_to_list(lines[1].strip())
    except Exception as e:
        logger.info("Failed to tokenize: %s", e)
        return ("", [])