From c1ec6c2e091a87ef2f3889fd4d6295d1393d120f Mon Sep 17 00:00:00 2001 From: Alexander Milster Date: Sun, 19 Jan 2025 08:22:55 +0100 Subject: [PATCH] Removed non-common tokens from GST --- .../java/de/jplag/GreedyStringTiling.java | 43 +++++++++++++------ core/src/main/java/de/jplag/Submission.java | 4 ++ 2 files changed, 33 insertions(+), 14 deletions(-) diff --git a/core/src/main/java/de/jplag/GreedyStringTiling.java b/core/src/main/java/de/jplag/GreedyStringTiling.java index 3cef93354e..dbc248efe2 100644 --- a/core/src/main/java/de/jplag/GreedyStringTiling.java +++ b/core/src/main/java/de/jplag/GreedyStringTiling.java @@ -98,14 +98,29 @@ public final JPlagComparison compare(Submission firstSubmission, Submission seco * @return the comparison results. */ private JPlagComparison compareInternal(Submission leftSubmission, Submission rightSubmission) { - int[] leftValues = tokenValueListFromSubmission(leftSubmission); - int[] rightValues = tokenValueListFromSubmission(rightSubmission); - - boolean[] leftMarked = calculateInitiallyMarked(leftSubmission); - boolean[] rightMarked = calculateInitiallyMarked(rightSubmission); - - SubsequenceHashLookupTable leftLookupTable = subsequenceHashLookupTableForSubmission(leftSubmission, leftMarked); - SubsequenceHashLookupTable rightLookupTable = subsequenceHashLookupTableForSubmission(rightSubmission, rightMarked); + List> contextsLeft = leftSubmission.getTokenList().stream().map(it -> it.getLanguage().getTokenContexts()).reduce((left, right) -> { + List> leftContexts = new ArrayList<>(left); + leftContexts.retainAll(right); + return leftContexts; + }).get(); + List> contextsRight = rightSubmission.getTokenList().stream().map(it -> it.getLanguage().getTokenContexts()) + .reduce((left, right) -> { + List> leftContexts = new ArrayList<>(left); + leftContexts.retainAll(right); + return leftContexts; + }).get(); + + List> contexts = new ArrayList<>(contextsLeft); + contexts.retainAll(contextsRight); + + int[] leftValues = tokenValueListFromSubmission(leftSubmission, contexts); + int[] rightValues = tokenValueListFromSubmission(rightSubmission, contexts); + + boolean[] leftMarked = calculateInitiallyMarked(leftSubmission, contexts); + boolean[] rightMarked = calculateInitiallyMarked(rightSubmission, contexts); + + SubsequenceHashLookupTable leftLookupTable = subsequenceHashLookupTableForSubmission(leftSubmission, leftMarked, contexts); + SubsequenceHashLookupTable rightLookupTable = subsequenceHashLookupTableForSubmission(rightSubmission, rightMarked, contexts); int maximumMatchLength; List globalMatches = new ArrayList<>(); @@ -195,9 +210,9 @@ private void addMatchIfNotOverlapping(List matches, Match match) { matches.add(match); } - private boolean[] calculateInitiallyMarked(Submission submission) { + private boolean[] calculateInitiallyMarked(Submission submission, List> contexts) { Set baseCodeTokens = baseCodeMarkings.get(submission); - List tokens = submission.getTokenList(); + List tokens = submission.getTokenList(contexts); boolean[] result = new boolean[tokens.size()]; for (int i = 0; i < result.length; i++) { result[i] = tokens.get(i).getType().isExcludedFromMatching() || (baseCodeTokens != null && baseCodeTokens.contains(tokens.get(i))); @@ -205,18 +220,18 @@ private boolean[] calculateInitiallyMarked(Submission submission) { return result; } - private SubsequenceHashLookupTable subsequenceHashLookupTableForSubmission(Submission submission, boolean[] marked) { + private SubsequenceHashLookupTable subsequenceHashLookupTableForSubmission(Submission submission, boolean[] marked, List> contexts) { return cachedHashLookupTables.computeIfAbsent(submission, - (key -> new SubsequenceHashLookupTable(minimumMatchLength, tokenValueListFromSubmission(key), marked))); + (key -> new SubsequenceHashLookupTable(minimumMatchLength, tokenValueListFromSubmission(key, contexts), marked))); } /** * Converts the tokens of the submission to a list of values. * @param submission The submission from which to convert the tokens. */ - private int[] tokenValueListFromSubmission(Submission submission) { + private int[] tokenValueListFromSubmission(Submission submission, List> contexts) { return cachedTokenValueLists.computeIfAbsent(submission, (key -> { - List tokens = key.getTokenList(); + List tokens = key.getTokenList(contexts); int[] tokenValueList = new int[tokens.size()]; for (int i = 0; i < tokens.size(); i++) { TokenType type = tokens.get(i).getType(); diff --git a/core/src/main/java/de/jplag/Submission.java b/core/src/main/java/de/jplag/Submission.java index 5610a19873..2995a20963 100644 --- a/core/src/main/java/de/jplag/Submission.java +++ b/core/src/main/java/de/jplag/Submission.java @@ -140,6 +140,10 @@ public List getTokenList() { return tokenList; } + public List getTokenList(List> contexts) { + return tokenList.stream().filter(it -> contexts.contains(it.getType().getContext())).toList(); + } + /** * @return Whether a comparison between the submission and the base code is available. */