From c27b42f590b2d88ad4b953be901971a6db25b9bc Mon Sep 17 00:00:00 2001 From: ClFeSc <68013019+ClFeSc@users.noreply.github.com> Date: Wed, 21 Jun 2023 08:44:02 +0200 Subject: [PATCH 01/71] Add String Encoding Benchmark Script --- scripts/evaluate_string_segments.py | 314 ++++++++++++++++++++++++++++ 1 file changed, 314 insertions(+) create mode 100755 scripts/evaluate_string_segments.py diff --git a/scripts/evaluate_string_segments.py b/scripts/evaluate_string_segments.py new file mode 100755 index 0000000000..0b8f850108 --- /dev/null +++ b/scripts/evaluate_string_segments.py @@ -0,0 +1,314 @@ +#!/usr/bin/env python3 + +from abc import ABC, abstractmethod +from argparse import ArgumentParser, BooleanOptionalAction +from dataclasses import dataclass +import multiprocessing +from os import path +from subprocess import check_output +import sys +from typing import Any, Literal + + +DEBUG_MODE = False + +def print_debug(*args, **kwargs) -> None: + if DEBUG_MODE: + print(*args, **kwargs) + + +def print_error(*args, **kwargs) -> None: + print(*args, file=sys.stderr, **kwargs) + + +def output(*args, **kwargs) -> None: + print(*args, file=output_file, **kwargs) + + +@dataclass(frozen=True) +class Configuration: + output_file: str + build_path: str + tmp_path: str + scale_factor: float + encoding_under_test: str + time_limit: int + debug: bool + + +class Benchmark(ABC): + _config: Configuration + + # These encodings are supported by default for string types. + _encodings = [ + 'Unencoded', + 'Dictionary', + 'RunLength', + 'FixedStringDictionary', + 'LZ4' + ] + + @staticmethod + def create(name: str, *args: Any, **kwargs: Any) -> 'Benchmark': + classes = { + 'hyriseBenchmarkTPCH': TPCHBenchmark, + 'hyriseBenchmarkTPCDS': TPCDSBenchmark, + 'hyriseBenchmarkJoinOrder': JoinOrderBenchmark, + 'hyriseBenchmarkStarSchema': StarSchemaBenchmark + } + return classes[name](*args, **kwargs) + + def run(self, config: Configuration) -> None: + try: + self._config = config + output(f'## Benchmark {self.name}:') + for threading in ['ST', 'MT']: + output(f'### {"Single Thread" if threading == "ST" else "Mulitple Threads"}') + result_jsons = [self._run(threading, encoding) for encoding in self._encodings] + test_json = self._run(threading, self._config.encoding_under_test) + for (encoding, json) in zip(self._encodings, result_jsons): + check_command = ['./scripts/compare_benchmarks.py', json, test_json] + compare_result = check_output(check_command) + output(f'Result for {encoding} vs. {self._config.encoding_under_test}:') + output(compare_result.decode(encoding='utf-8')) + except Exception as ex: + error_message = f'Skipped {self.name} due to error {ex}.' + print_error(error_message) + output(error_message) + + @abstractmethod + def _run(self, threading: Literal['ST', 'MT'], encoding: str) -> str: + pass + + @property + def _path(self) -> str: + pass + + @abstractmethod + def _output_path(self, threading: Literal['ST'] | Literal['MT']) -> str: + pass + + +class TPCHBenchmark(Benchmark): + name = 'tpch' + def _run(self, threading: Literal['ST', 'MT'], encoding: str) -> str: + if threading == 'ST': + benchmark_path = self._path + st_output_path = self._output_path(threading, encoding) + st_command = [benchmark_path, '-o', st_output_path, '-t', str(self._config.time_limit), '-s', str(self._config.scale_factor), '-e', encoding] + print_debug(' '.join(st_command)) + st_output = check_output(st_command) + print_debug(st_output) + return st_output_path + else: + benchmark_path = self._path + st_output_path = self._output_path(threading, encoding) + st_command = [benchmark_path, '-o', st_output_path, '-t', str(self._config.time_limit), '-s', str(self._config.scale_factor), '-e', encoding, + '--scheduler', '--clients', str(multiprocessing.cpu_count() // 4), '--mode=Shuffled'] + print_debug(' '.join(st_command)) + st_output = check_output(st_command) + print_debug(st_output) + return st_output_path + + + @property + def _path(self) -> str: + return path.join(self._config.build_path, 'hyriseBenchmarkTPCH') + + def _output_path(self, threading: Literal['ST'] | Literal['MT'], encoding: str) -> str: + return path.join(self._config.tmp_path, f'{self.name}-{threading}-{encoding}.json') + + +class TPCDSBenchmark(Benchmark): + name = 'tpcds' + def _run(self, threading: Literal['ST', 'MT'], encoding: str) -> None:# + # TPC-DS only supports integer scales + scaling_factor = max(1, int(self._config.scale_factor)) + if threading == 'ST': + benchmark_path = self._path + st_output_path = self._output_path(threading, encoding) + st_command = [benchmark_path, '-o', st_output_path, '-t', str(self._config.time_limit), '-s', str(scaling_factor), '-e', encoding] + print_debug(' '.join(st_command)) + st_output = check_output(st_command) + print_debug(st_output) + return st_output_path + else: + benchmark_path = self._path + st_output_path = self._output_path(threading, encoding) + st_command = [benchmark_path, '-o', st_output_path, '-t', str(self._config.time_limit), '-s', str(scaling_factor), '-e', encoding, + '--scheduler', '--clients', str(multiprocessing.cpu_count() // 4), '--mode=Shuffled'] + print_debug(' '.join(st_command)) + st_output = check_output(st_command) + print_debug(st_output) + return st_output_path + + @property + def _path(self) -> str: + return path.join(self._config.build_path, 'hyriseBenchmarkTPCDS') + + def _output_path(self, threading: Literal['ST'] | Literal['MT'], encoding: str) -> str: + return path.join(self._config.tmp_path, f'{self.name}-{threading}-{encoding}.json') + + +class JoinOrderBenchmark(Benchmark): + name = 'job' + def _run(self, threading: Literal['ST', 'MT'], encoding: str) -> str: + if threading == 'ST': + benchmark_path = self._path + st_output_path = self._output_path(threading, encoding) + st_command = [benchmark_path, '-o', st_output_path, '-t', str(self._config.time_limit), '-e', encoding] + print_debug(' '.join(st_command)) + st_output = check_output(st_command) + print_debug(st_output) + return st_output_path + else: + benchmark_path = self._path + st_output_path = self._output_path(threading, encoding) + st_command = [benchmark_path, '-o', st_output_path, '-t', str(self._config.time_limit), '-e', encoding, + '--scheduler', '--clients', str(multiprocessing.cpu_count() // 4), '--mode=Shuffled'] + print_debug(' '.join(st_command)) + st_output = check_output(st_command) + print_debug(st_output) + return st_output_path + + @property + def _path(self) -> str: + return path.join(self._config.build_path, 'hyriseBenchmarkJoinOrder') + + def _output_path(self, threading: Literal['ST'] | Literal['MT'], encoding: str) -> str: + return path.join(self._config.tmp_path, f'{self.name}-{threading}-{encoding}.json') + + +class StarSchemaBenchmark(Benchmark): + name = 'ssb' + def _run(self, threading: Literal['ST', 'MT'], encoding: str) -> None: + if threading == 'ST': + benchmark_path = self._path + st_output_path = self._output_path(threading, encoding) + st_command = [benchmark_path, '-o', st_output_path, '-t', str(self._config.time_limit), '-e', encoding] + print_debug(' '.join(st_command)) + st_output = check_output(st_command) + print_debug(st_output) + return st_output_path + else: + benchmark_path = self._path + st_output_path = self._output_path(threading, encoding) + st_command = [benchmark_path, '-o', st_output_path, '-t', str(self._config.time_limit), '-e', encoding, + '--scheduler', '--clients', str(multiprocessing.cpu_count() // 4), '--mode=Shuffled'] + print_debug(' '.join(st_command)) + st_output = check_output(st_command) + print_debug(st_output) + return st_output_path + + @property + def _path(self) -> str: + return path.join(self._config.build_path, 'hyriseBenchmarkStarSchema') + + def _output_path(self, threading: Literal['ST'] | Literal['MT'], encoding: str) -> str: + return path.join(self._config.tmp_path, f'{self.name}-{threading}-{encoding}.json') + + +def parse_arguments() -> Configuration: + global DEBUG_MODE + parser = ArgumentParser() + parser.add_argument( + '-o', + '--output-file', + dest='output_file', + type=str, + required=True, + help='The file where the output should be stored.' + ) + parser.add_argument( + '-b', + '--build-path', + dest='build_path', + type=str, + required=True, + help='Path where the executables to benchmark are located.' + ) + parser.add_argument( + '-s', + '--scale-factor', + dest='scale_factor', + type=float, + required=True, + help='The scale factor to pass to the benchmarks that support scaling. Note that this number might get rounded or ignored if necessary for a benchmark.' + ) + parser.add_argument( + '-t', + '--tmp-path', + dest='tmp_path', + type=str, + required=False, + default='tmp', + help='The directory where the benchmark result files will be stored.' + ) + parser.add_argument( + '-e', + '--encoding-under-test', + dest='encoding', + type=str, + required=True, + help='The name of the encoding to compare against built-in encodings.' + ) + parser.add_argument( + '-t', + '--timeout', + dest='timeout', + type=int, + required=False, + default=-1, + help='The timeout in seconds to pass to the benchmarks. Defaults to -1, i.e. no timeout.' + ) + parser.add_argument( + '-d', + '--debug', + dest='debug', + required=False, + default=False, + action=BooleanOptionalAction, + help='Whether to activate debug mode (more verbose output to stdout).' + ) + namespace = parser.parse_args() + if namespace.debug: + DEBUG_MODE = True + print_debug('Debug Mode enabled.') + return Configuration( + output_file=namespace.output_file, + build_path=namespace.build_path, + tmp_path=namespace.tmp_path, + scale_factor=namespace.scale_factor, + encoding_under_test=namespace.encoding, + time_limit=namespace.timeout, + debug=namespace.debug) + + +def scale_factor_for_benchmark(benchmark: str, scale_factor: float) -> float: + if benchmark == 'hyriseBenchmarkTPCDS': + return max(scale_factor, 1) + return scale_factor + + +def locate_benchmarks(benchmarks: list[str], config: Configuration) -> list[Benchmark]: + benchmark_objects: list[Benchmark] = [] + for benchmark in benchmarks: + benchmark_path = path.join(config.build_path, benchmark) + if not path.isfile(benchmark_path): + exit(f'Cannot locate {benchmark} at {benchmark_path}!') + benchmark_objects.append(Benchmark.create(benchmark)) + return benchmark_objects + + +def main(): + global output_file + config = parse_arguments() + with open(config.output_file, 'w+') as output_file: + benchmarks_names = ['hyriseBenchmarkTPCH', 'hyriseBenchmarkTPCDS', 'hyriseBenchmarkJoinOrder', 'hyriseBenchmarkStarSchema'] + benchmarks = locate_benchmarks(benchmarks_names, config) + for benchmark in benchmarks: + benchmark.run(config) + + +if __name__ == '__main__': + main() From 439a28bcf1250d08f7d51f051c72d73b802ea860 Mon Sep 17 00:00:00 2001 From: Philipp Keese Date: Wed, 21 Jun 2023 15:49:52 +0200 Subject: [PATCH 02/71] Remove leftover files before running benchmarks --- scripts/evaluate_string_segments.py | 13 ++++++++++--- 1 file changed, 10 insertions(+), 3 deletions(-) diff --git a/scripts/evaluate_string_segments.py b/scripts/evaluate_string_segments.py index 0b8f850108..a99e52763e 100755 --- a/scripts/evaluate_string_segments.py +++ b/scripts/evaluate_string_segments.py @@ -1,10 +1,11 @@ #!/usr/bin/env python3 - +import shutil from abc import ABC, abstractmethod from argparse import ArgumentParser, BooleanOptionalAction from dataclasses import dataclass import multiprocessing from os import path +from pathlib import Path from subprocess import check_output import sys from typing import Any, Literal @@ -24,6 +25,10 @@ def print_error(*args, **kwargs) -> None: def output(*args, **kwargs) -> None: print(*args, file=output_file, **kwargs) +def rm_dir(path: str) -> None: + dirpath = Path(path) + if dirpath.exists() and dirpath.is_dir(): + shutil.rmtree(dirpath) @dataclass(frozen=True) class Configuration: @@ -92,6 +97,7 @@ def _output_path(self, threading: Literal['ST'] | Literal['MT']) -> str: class TPCHBenchmark(Benchmark): name = 'tpch' def _run(self, threading: Literal['ST', 'MT'], encoding: str) -> str: + rm_dir("tpch_cached_tables") if threading == 'ST': benchmark_path = self._path st_output_path = self._output_path(threading, encoding) @@ -121,7 +127,8 @@ def _output_path(self, threading: Literal['ST'] | Literal['MT'], encoding: str) class TPCDSBenchmark(Benchmark): name = 'tpcds' - def _run(self, threading: Literal['ST', 'MT'], encoding: str) -> None:# + def _run(self, threading: Literal['ST', 'MT'], encoding: str) -> None: + rm_dir("tpcds_cached_tables") # TPC-DS only supports integer scales scaling_factor = max(1, int(self._config.scale_factor)) if threading == 'ST': @@ -236,7 +243,7 @@ def parse_arguments() -> Configuration: help='The scale factor to pass to the benchmarks that support scaling. Note that this number might get rounded or ignored if necessary for a benchmark.' ) parser.add_argument( - '-t', + '-p', '--tmp-path', dest='tmp_path', type=str, From 9f310b553f76477c952dc01da049b73c467450cb Mon Sep 17 00:00:00 2001 From: ClFeSc <68013019+ClFeSc@users.noreply.github.com> Date: Wed, 21 Jun 2023 19:42:10 +0200 Subject: [PATCH 03/71] Fix incorrect timeout value * Also create temp-directory when not exists --- scripts/evaluate_string_segments.py | 14 ++++++++++---- 1 file changed, 10 insertions(+), 4 deletions(-) diff --git a/scripts/evaluate_string_segments.py b/scripts/evaluate_string_segments.py index a99e52763e..26baf878b9 100755 --- a/scripts/evaluate_string_segments.py +++ b/scripts/evaluate_string_segments.py @@ -1,7 +1,7 @@ #!/usr/bin/env python3 import shutil from abc import ABC, abstractmethod -from argparse import ArgumentParser, BooleanOptionalAction +from argparse import ArgumentParser, ArgumentTypeError, BooleanOptionalAction from dataclasses import dataclass import multiprocessing from os import path @@ -217,6 +217,11 @@ def _output_path(self, threading: Literal['ST'] | Literal['MT'], encoding: str) def parse_arguments() -> Configuration: global DEBUG_MODE + def check_positive(value: any) -> int: + ivalue = int(value) + if ivalue <= 0: + raise ArgumentTypeError("%s is an invalid positive int value" % value) + return ivalue parser = ArgumentParser() parser.add_argument( '-o', @@ -263,10 +268,10 @@ def parse_arguments() -> Configuration: '-t', '--timeout', dest='timeout', - type=int, + type=check_positive, required=False, - default=-1, - help='The timeout in seconds to pass to the benchmarks. Defaults to -1, i.e. no timeout.' + default=60, + help='The timeout in seconds to pass to the benchmarks. Defaults to 60.' ) parser.add_argument( '-d', @@ -310,6 +315,7 @@ def locate_benchmarks(benchmarks: list[str], config: Configuration) -> list[Benc def main(): global output_file config = parse_arguments() + Path(config.tmp_path).mkdir(parents=True, exist_ok=True) with open(config.output_file, 'w+') as output_file: benchmarks_names = ['hyriseBenchmarkTPCH', 'hyriseBenchmarkTPCDS', 'hyriseBenchmarkJoinOrder', 'hyriseBenchmarkStarSchema'] benchmarks = locate_benchmarks(benchmarks_names, config) From 1dc9f762869acea61d5f765854badba401a0e03a Mon Sep 17 00:00:00 2001 From: ClFeSc <68013019+ClFeSc@users.noreply.github.com> Date: Wed, 21 Jun 2023 21:36:03 +0200 Subject: [PATCH 04/71] Apply feedback by @23mafi regarding code duplication --- scripts/evaluate_string_segments.py | 108 +++++++++------------------- 1 file changed, 32 insertions(+), 76 deletions(-) diff --git a/scripts/evaluate_string_segments.py b/scripts/evaluate_string_segments.py index 26baf878b9..d068d89bd6 100755 --- a/scripts/evaluate_string_segments.py +++ b/scripts/evaluate_string_segments.py @@ -81,8 +81,22 @@ def run(self, config: Configuration) -> None: print_error(error_message) output(error_message) - @abstractmethod def _run(self, threading: Literal['ST', 'MT'], encoding: str) -> str: + self._pre_run_cleanup() + st_command = self._get_arguments(threading, encoding) + print_debug(' '.join(st_command)) + st_output = check_output(st_command) + print_debug(st_output) + return self._output_path(threading, encoding) + + def _get_arguments(self, threading: Literal['ST', 'MT'], encoding: str) -> list[str]: + arguments = [self._path, '-o', self._output_path(threading, encoding), '-t', str(self._config.time_limit), '-e', encoding] + if threading == 'MT': + arguments += ['--scheduler', '--clients', str(multiprocessing.cpu_count() // 4), '--mode=Shuffled'] + return arguments + + @abstractmethod + def _pre_run_cleanup(self) -> None: pass @property @@ -96,26 +110,12 @@ def _output_path(self, threading: Literal['ST'] | Literal['MT']) -> str: class TPCHBenchmark(Benchmark): name = 'tpch' - def _run(self, threading: Literal['ST', 'MT'], encoding: str) -> str: - rm_dir("tpch_cached_tables") - if threading == 'ST': - benchmark_path = self._path - st_output_path = self._output_path(threading, encoding) - st_command = [benchmark_path, '-o', st_output_path, '-t', str(self._config.time_limit), '-s', str(self._config.scale_factor), '-e', encoding] - print_debug(' '.join(st_command)) - st_output = check_output(st_command) - print_debug(st_output) - return st_output_path - else: - benchmark_path = self._path - st_output_path = self._output_path(threading, encoding) - st_command = [benchmark_path, '-o', st_output_path, '-t', str(self._config.time_limit), '-s', str(self._config.scale_factor), '-e', encoding, - '--scheduler', '--clients', str(multiprocessing.cpu_count() // 4), '--mode=Shuffled'] - print_debug(' '.join(st_command)) - st_output = check_output(st_command) - print_debug(st_output) - return st_output_path + def _get_arguments(self, threading: Literal['ST', 'MT'], encoding: str) -> list[str]: + return super()._get_arguments(threading, encoding) + ['-s', str(self._config.scale_factor)] + + def _pre_run_cleanup(self) -> None: + rm_dir('tpch_cached_tables') @property def _path(self) -> str: @@ -127,27 +127,13 @@ def _output_path(self, threading: Literal['ST'] | Literal['MT'], encoding: str) class TPCDSBenchmark(Benchmark): name = 'tpcds' - def _run(self, threading: Literal['ST', 'MT'], encoding: str) -> None: - rm_dir("tpcds_cached_tables") + + def _get_arguments(self, threading: Literal['ST', 'MT'], encoding: str) -> list[str]: # TPC-DS only supports integer scales - scaling_factor = max(1, int(self._config.scale_factor)) - if threading == 'ST': - benchmark_path = self._path - st_output_path = self._output_path(threading, encoding) - st_command = [benchmark_path, '-o', st_output_path, '-t', str(self._config.time_limit), '-s', str(scaling_factor), '-e', encoding] - print_debug(' '.join(st_command)) - st_output = check_output(st_command) - print_debug(st_output) - return st_output_path - else: - benchmark_path = self._path - st_output_path = self._output_path(threading, encoding) - st_command = [benchmark_path, '-o', st_output_path, '-t', str(self._config.time_limit), '-s', str(scaling_factor), '-e', encoding, - '--scheduler', '--clients', str(multiprocessing.cpu_count() // 4), '--mode=Shuffled'] - print_debug(' '.join(st_command)) - st_output = check_output(st_command) - print_debug(st_output) - return st_output_path + return super()._get_arguments(threading, encoding) + ['-s', str(max(1, int(self._config.scale_factor)))] + + def _pre_run_cleanup(self) -> None: + rm_dir('tpcds_cached_tables') @property def _path(self) -> str: @@ -159,24 +145,9 @@ def _output_path(self, threading: Literal['ST'] | Literal['MT'], encoding: str) class JoinOrderBenchmark(Benchmark): name = 'job' - def _run(self, threading: Literal['ST', 'MT'], encoding: str) -> str: - if threading == 'ST': - benchmark_path = self._path - st_output_path = self._output_path(threading, encoding) - st_command = [benchmark_path, '-o', st_output_path, '-t', str(self._config.time_limit), '-e', encoding] - print_debug(' '.join(st_command)) - st_output = check_output(st_command) - print_debug(st_output) - return st_output_path - else: - benchmark_path = self._path - st_output_path = self._output_path(threading, encoding) - st_command = [benchmark_path, '-o', st_output_path, '-t', str(self._config.time_limit), '-e', encoding, - '--scheduler', '--clients', str(multiprocessing.cpu_count() // 4), '--mode=Shuffled'] - print_debug(' '.join(st_command)) - st_output = check_output(st_command) - print_debug(st_output) - return st_output_path + + def _pre_run_cleanup(self) -> None: + pass @property def _path(self) -> str: @@ -188,24 +159,9 @@ def _output_path(self, threading: Literal['ST'] | Literal['MT'], encoding: str) class StarSchemaBenchmark(Benchmark): name = 'ssb' - def _run(self, threading: Literal['ST', 'MT'], encoding: str) -> None: - if threading == 'ST': - benchmark_path = self._path - st_output_path = self._output_path(threading, encoding) - st_command = [benchmark_path, '-o', st_output_path, '-t', str(self._config.time_limit), '-e', encoding] - print_debug(' '.join(st_command)) - st_output = check_output(st_command) - print_debug(st_output) - return st_output_path - else: - benchmark_path = self._path - st_output_path = self._output_path(threading, encoding) - st_command = [benchmark_path, '-o', st_output_path, '-t', str(self._config.time_limit), '-e', encoding, - '--scheduler', '--clients', str(multiprocessing.cpu_count() // 4), '--mode=Shuffled'] - print_debug(' '.join(st_command)) - st_output = check_output(st_command) - print_debug(st_output) - return st_output_path + + def _pre_run_cleanup(self) -> None: + pass @property def _path(self) -> str: From 5c731975e12917e9c327b350eb9c57d3a54f28a6 Mon Sep 17 00:00:00 2001 From: ClFeSc <68013019+ClFeSc@users.noreply.github.com> Date: Wed, 21 Jun 2023 21:55:49 +0200 Subject: [PATCH 05/71] Add verbosity levels --- scripts/evaluate_string_segments.py | 57 +++++++++++++++++------------ 1 file changed, 33 insertions(+), 24 deletions(-) diff --git a/scripts/evaluate_string_segments.py b/scripts/evaluate_string_segments.py index d068d89bd6..b4c3718568 100755 --- a/scripts/evaluate_string_segments.py +++ b/scripts/evaluate_string_segments.py @@ -1,7 +1,7 @@ #!/usr/bin/env python3 import shutil from abc import ABC, abstractmethod -from argparse import ArgumentParser, ArgumentTypeError, BooleanOptionalAction +from argparse import ArgumentParser, ArgumentTypeError from dataclasses import dataclass import multiprocessing from os import path @@ -11,10 +11,10 @@ from typing import Any, Literal -DEBUG_MODE = False +VERBOSITY_LEVEL = 0 -def print_debug(*args, **kwargs) -> None: - if DEBUG_MODE: +def print_debug(*args, required_verbosity_level: int, **kwargs) -> None: + if VERBOSITY_LEVEL >= required_verbosity_level: print(*args, **kwargs) @@ -22,14 +22,21 @@ def print_error(*args, **kwargs) -> None: print(*args, file=sys.stderr, **kwargs) -def output(*args, **kwargs) -> None: +def output(*args, required_verbosity_level, **kwargs) -> None: + print_debug(*args, required_verbosity_level=required_verbosity_level, **kwargs) print(*args, file=output_file, **kwargs) + def rm_dir(path: str) -> None: dirpath = Path(path) if dirpath.exists() and dirpath.is_dir(): shutil.rmtree(dirpath) + +def enquote(x: str, /, *, quoatation_character='"') -> str: + return f'{quoatation_character}{x}{quoatation_character}' + + @dataclass(frozen=True) class Configuration: output_file: str @@ -38,7 +45,7 @@ class Configuration: scale_factor: float encoding_under_test: str time_limit: int - debug: bool + verbosity_level: int class Benchmark(ABC): @@ -66,27 +73,28 @@ def create(name: str, *args: Any, **kwargs: Any) -> 'Benchmark': def run(self, config: Configuration) -> None: try: self._config = config - output(f'## Benchmark {self.name}:') + output(f'## Benchmark {self.name}:', required_verbosity_level=1) for threading in ['ST', 'MT']: - output(f'### {"Single Thread" if threading == "ST" else "Mulitple Threads"}') + output(f'### {"Single Thread" if threading == "ST" else "Mulitple Threads"}', required_verbosity_level=1) result_jsons = [self._run(threading, encoding) for encoding in self._encodings] test_json = self._run(threading, self._config.encoding_under_test) for (encoding, json) in zip(self._encodings, result_jsons): check_command = ['./scripts/compare_benchmarks.py', json, test_json] compare_result = check_output(check_command) - output(f'Result for {encoding} vs. {self._config.encoding_under_test}:') - output(compare_result.decode(encoding='utf-8')) + output(f'Result for {encoding} vs. {self._config.encoding_under_test}:', required_verbosity_level=3) + output(compare_result.decode(encoding='utf-8'), required_verbosity_level=3) except Exception as ex: error_message = f'Skipped {self.name} due to error {ex}.' print_error(error_message) - output(error_message) + output(error_message, required_verbosity_level=0) def _run(self, threading: Literal['ST', 'MT'], encoding: str) -> str: self._pre_run_cleanup() + print_debug(f'Running benchmark {self.name} for encoding {encoding}.', required_verbosity_level=1) st_command = self._get_arguments(threading, encoding) - print_debug(' '.join(st_command)) + print_debug(f'Command: `{" ".join(map(lambda x: enquote(x), st_command))}`', required_verbosity_level=2) st_output = check_output(st_command) - print_debug(st_output) + print_debug(f'Output of above command: `{st_output}`', required_verbosity_level=3) return self._output_path(threading, encoding) def _get_arguments(self, threading: Literal['ST', 'MT'], encoding: str) -> list[str]: @@ -172,7 +180,7 @@ def _output_path(self, threading: Literal['ST'] | Literal['MT'], encoding: str) def parse_arguments() -> Configuration: - global DEBUG_MODE + global VERBOSITY_LEVEL def check_positive(value: any) -> int: ivalue = int(value) if ivalue <= 0: @@ -230,18 +238,18 @@ def check_positive(value: any) -> int: help='The timeout in seconds to pass to the benchmarks. Defaults to 60.' ) parser.add_argument( - '-d', - '--debug', - dest='debug', + '-v', + '--verbose', + dest='verbosity_level', required=False, - default=False, - action=BooleanOptionalAction, - help='Whether to activate debug mode (more verbose output to stdout).' + default=0, + action='count', + help='Verbosity level. Supports multiple ocurrences (like `-vvv`) to increase verbosity.' ) namespace = parser.parse_args() - if namespace.debug: - DEBUG_MODE = True - print_debug('Debug Mode enabled.') + if namespace.verbosity_level > 0: + VERBOSITY_LEVEL = namespace.verbosity_level + print_debug(f'Verbosity Mode enabled on level {VERBOSITY_LEVEL}.', required_verbosity_level=VERBOSITY_LEVEL) return Configuration( output_file=namespace.output_file, build_path=namespace.build_path, @@ -249,7 +257,7 @@ def check_positive(value: any) -> int: scale_factor=namespace.scale_factor, encoding_under_test=namespace.encoding, time_limit=namespace.timeout, - debug=namespace.debug) + verbosity_level=namespace.verbosity_level) def scale_factor_for_benchmark(benchmark: str, scale_factor: float) -> float: @@ -273,6 +281,7 @@ def main(): config = parse_arguments() Path(config.tmp_path).mkdir(parents=True, exist_ok=True) with open(config.output_file, 'w+') as output_file: + print_debug(f'Running benchmark comparing {config.encoding_under_test} Encoding against built-in encodings.', required_verbosity_level=1) benchmarks_names = ['hyriseBenchmarkTPCH', 'hyriseBenchmarkTPCDS', 'hyriseBenchmarkJoinOrder', 'hyriseBenchmarkStarSchema'] benchmarks = locate_benchmarks(benchmarks_names, config) for benchmark in benchmarks: From 2bb38ba53792ce775af744d3302e0953ddab2151 Mon Sep 17 00:00:00 2001 From: ClFeSc <68013019+ClFeSc@users.noreply.github.com> Date: Thu, 22 Jun 2023 15:42:19 +0200 Subject: [PATCH 06/71] Use longer timeout for multithreading --- scripts/evaluate_string_segments.py | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/scripts/evaluate_string_segments.py b/scripts/evaluate_string_segments.py index b4c3718568..d9a3054968 100755 --- a/scripts/evaluate_string_segments.py +++ b/scripts/evaluate_string_segments.py @@ -98,9 +98,15 @@ def _run(self, threading: Literal['ST', 'MT'], encoding: str) -> str: return self._output_path(threading, encoding) def _get_arguments(self, threading: Literal['ST', 'MT'], encoding: str) -> list[str]: - arguments = [self._path, '-o', self._output_path(threading, encoding), '-t', str(self._config.time_limit), '-e', encoding] + arguments = [self._path, '-o', self._output_path(threading, encoding), '-e', encoding] if threading == 'MT': arguments += ['--scheduler', '--clients', str(multiprocessing.cpu_count() // 4), '--mode=Shuffled'] + # Multithreaded runs need longer times to be meaningful. Default to 20 minutes. + arguments += ['-t', str(self._config.time_limit * 20)] + else: + arguments += ['-t', str(self._config.time_limit)] + if metrics: + arguments += ['--metrics', '-r', '1'] return arguments @abstractmethod From 474d615a7f8d3e2200a2ea7b3113e66d194f0044 Mon Sep 17 00:00:00 2001 From: ClFeSc <68013019+ClFeSc@users.noreply.github.com> Date: Mon, 26 Jun 2023 18:51:52 +0200 Subject: [PATCH 07/71] Add metrics to script and improve output --- scripts/evaluate_string_segments.py | 200 ++++++++++++++++++++++------ 1 file changed, 157 insertions(+), 43 deletions(-) diff --git a/scripts/evaluate_string_segments.py b/scripts/evaluate_string_segments.py index d9a3054968..862f26bf52 100755 --- a/scripts/evaluate_string_segments.py +++ b/scripts/evaluate_string_segments.py @@ -1,4 +1,5 @@ #!/usr/bin/env python3 +import json import shutil from abc import ABC, abstractmethod from argparse import ArgumentParser, ArgumentTypeError @@ -6,9 +7,14 @@ import multiprocessing from os import path from pathlib import Path +import statistics from subprocess import check_output import sys -from typing import Any, Literal +from datetime import datetime +from typing import Any, Literal, Mapping + +import pandas as pd +import matplotlib.pyplot as plt VERBOSITY_LEVEL = 0 @@ -24,7 +30,7 @@ def print_error(*args, **kwargs) -> None: def output(*args, required_verbosity_level, **kwargs) -> None: print_debug(*args, required_verbosity_level=required_verbosity_level, **kwargs) - print(*args, file=output_file, **kwargs) + # print(*args, file=output_file, **kwargs) def rm_dir(path: str) -> None: @@ -37,15 +43,39 @@ def enquote(x: str, /, *, quoatation_character='"') -> str: return f'{quoatation_character}{x}{quoatation_character}' +def read_json(path: str) -> dict: + with open(path) as file: + return json.load(file) + + @dataclass(frozen=True) class Configuration: - output_file: str + output_directory: str build_path: str tmp_path: str scale_factor: float encoding_under_test: str time_limit: int verbosity_level: int + + +def plot(results: Mapping[str, float], *, title: str, yaxis: str, path: str, figsize: tuple[int, int]=(15, 10)) -> None: + f, axiis = plt.subplots(1, 1, figsize=figsize) + data=pd.DataFrame(data=results) + print_debug(data, required_verbosity_level=3) + if data.empty: + print_error('Data Frame is empty; no result data to show!') + return + data.plot( + kind='box', + ax=axiis, + title=title, + xlabel='Encodings', + ylabel=f'{yaxis} (Logarithmic Scale)', + logy=True, + ) + f.tight_layout() + f.savefig(path) class Benchmark(ABC): @@ -76,29 +106,120 @@ def run(self, config: Configuration) -> None: output(f'## Benchmark {self.name}:', required_verbosity_level=1) for threading in ['ST', 'MT']: output(f'### {"Single Thread" if threading == "ST" else "Mulitple Threads"}', required_verbosity_level=1) - result_jsons = [self._run(threading, encoding) for encoding in self._encodings] - test_json = self._run(threading, self._config.encoding_under_test) - for (encoding, json) in zip(self._encodings, result_jsons): - check_command = ['./scripts/compare_benchmarks.py', json, test_json] - compare_result = check_output(check_command) - output(f'Result for {encoding} vs. {self._config.encoding_under_test}:', required_verbosity_level=3) - output(compare_result.decode(encoding='utf-8'), required_verbosity_level=3) + result_jsons = [self._run(threading, encoding, False) for encoding in self._encodings] + test_json = self._run(threading, self._config.encoding_under_test, False) + times = {encoding: self._compare_run(result_json) for result_json, encoding in zip(result_jsons, self._encodings)} + times[self._config.encoding_under_test] = self._compare_run(test_json) + plot_path = path.join(self._config.output_directory, 'runtime', 'plots', f'{self.name}-{threading}.png') + Path(plot_path).parent.mkdir(parents=True, exist_ok=True) + plot(times, title=f'Items per second for {self.name}', yaxis='Items per second of individual benchmark tests', path=plot_path) + # Dump raw data + raw_file_path = path.join(self._config.output_directory, 'runtime', 'raw', f'{self.name}-{threading}.json') + Path(raw_file_path).parent.mkdir(parents=True, exist_ok=True) + with open(raw_file_path, 'w') as f: + json.dump(times, f) + # for (encoding, json) in zip(self._encodings, result_jsons): + # check_command = ['./scripts/compare_benchmarks.py', json, test_json] + # compare_result = check_output(check_command) + # output(f'Result for {encoding} vs. {self._config.encoding_under_test}:', required_verbosity_level=3) + # output(compare_result.decode(encoding='utf-8'), required_verbosity_level=3) except Exception as ex: error_message = f'Skipped {self.name} due to error {ex}.' print_error(error_message) - output(error_message, required_verbosity_level=0) - - def _run(self, threading: Literal['ST', 'MT'], encoding: str) -> str: + output(error_message, required_verbosity_level=4) + + def _compare_run(self, json: str) -> list[float]: + @dataclass(frozen=True) + class Runtime: + benchmark_name: str + runtime: float + @dataclass + class Runtimes: + runtimes: list[Runtime] + json_file = read_json(json) + runtimes = Runtimes([]) + for benchmark in json_file['benchmarks']: + name = benchmark['name'] + # successful_runs = benchmark['successful_runs'] + duration = benchmark['items_per_second'] + # duration = statistics.mean(float(run['duration']) for run in successful_runs) + runtime = Runtime(name, duration) + runtimes.runtimes.append(runtime) + return list(map(lambda x: x.runtime, runtimes.runtimes)) + + def compare_metrics(self, config: Configuration) -> None: + try: + self._config = config + output(f'### Collecting Metrics for {self.name}:', required_verbosity_level=1) + result_jsons = [self._run('ST', encoding, True) for encoding in self._encodings] + test_json = self._run('ST', self._config.encoding_under_test, True) + metrics = {encoding: self._compare_metrics(result_json) for result_json, encoding in zip(result_jsons, self._encodings)} + metrics[self._config.encoding_under_test]= self._compare_metrics(test_json) + plot_path = path.join(self._config.output_directory, 'metrics', 'plots', f'{self.name}.png') + Path(plot_path).parent.mkdir(parents=True, exist_ok=True) + plot(metrics, title=f'Sizes for {self.name}', yaxis='Size of Segments', path=plot_path) + # Dump raw data + raw_file_path = path.join(self._config.output_directory, 'metrics', 'raw', f'{self.name}.json') + Path(raw_file_path).parent.mkdir(parents=True, exist_ok=True) + with open(raw_file_path, 'w') as f: + json.dump(metrics, f) + # for encoding, json in zip(self._encodings, result_jsons): + # self._compare_metrics(json, test_json, encoding, self._config.encoding_under_test) + except Exception as ex: + error_message = f'Skipped {self.name} due to error {ex}.' + print_error(error_message) + output(error_message, required_verbosity_level=4) + + # def _compare_metrics(self, reference_json: str, test_json: str, reference_encoding: str, test_encoding: str) -> 'Metrics': + def _compare_metrics(self, json: str) -> list[int]: + @dataclass(frozen=True) + class MemoryConsumption: + column_name: str + column_type: str + memory_consumption: int + @dataclass + class Metrics: + memory_consumptions: list[MemoryConsumption] + + def min(self) -> int: + return min(map(lambda x: x.memory_consumption, self.memory_consumptions)) + + def max(self) -> int: + return max(map(lambda x: x.memory_consumption, self.memory_consumptions)) + + def average(self) -> float: + return statistics.fmean(map(lambda x: x.memory_consumption, self.memory_consumptions)) + + def median(self) -> float: + return statistics.median(map(lambda x: x.memory_consumption, self.memory_consumptions)) + json_file = read_json(json) + # test = read_json(test_json) + metrics = Metrics([]) + # test_metrics = Metrics([]) + for segment in json_file['segments']: + # for ref_segment, test_segment in zip(reference['segments'], test['segments']): + column_type = segment['column_data_type'] + column_name = segment['column_name'] + # Only look at string columns + if column_type != 'string': + continue + metrics.memory_consumptions.append(MemoryConsumption(column_name, column_type, segment["estimated_size_in_bytes"])) + # test_metrics.memory_consumptions.append(MemoryConsumption(column_name, column_type, test_segment["estimated_size_in_bytes"])) + # output(f'For reference encoding {reference_encoding}: {{ min: {reference_metrics.min()}, max: {reference_metrics.max()}, average: {reference_metrics.average()}, median: {reference_metrics.median()} }}; ' + + # f'For test encoding {test_encoding}: {{ min: {test_metrics.min()}, max: {test_metrics.max()}, average: {test_metrics.average()}, median: {test_metrics.median()} }}', required_verbosity_level=2) + return list(map(lambda x: x.memory_consumption, metrics.memory_consumptions)) + + def _run(self, threading: Literal['ST', 'MT'], encoding: str, metrics: bool) -> str: self._pre_run_cleanup() print_debug(f'Running benchmark {self.name} for encoding {encoding}.', required_verbosity_level=1) - st_command = self._get_arguments(threading, encoding) + st_command = self._get_arguments(threading, encoding, metrics) print_debug(f'Command: `{" ".join(map(lambda x: enquote(x), st_command))}`', required_verbosity_level=2) st_output = check_output(st_command) print_debug(f'Output of above command: `{st_output}`', required_verbosity_level=3) - return self._output_path(threading, encoding) + return self._output_path(threading, encoding, metrics) - def _get_arguments(self, threading: Literal['ST', 'MT'], encoding: str) -> list[str]: - arguments = [self._path, '-o', self._output_path(threading, encoding), '-e', encoding] + def _get_arguments(self, threading: Literal['ST', 'MT'], encoding: str, metrics: bool) -> list[str]: + arguments = [self._path, '-o', self._output_path(threading, encoding, metrics), '-e', encoding] if threading == 'MT': arguments += ['--scheduler', '--clients', str(multiprocessing.cpu_count() // 4), '--mode=Shuffled'] # Multithreaded runs need longer times to be meaningful. Default to 20 minutes. @@ -117,16 +238,15 @@ def _pre_run_cleanup(self) -> None: def _path(self) -> str: pass - @abstractmethod - def _output_path(self, threading: Literal['ST'] | Literal['MT']) -> str: - pass + def _output_path(self, threading: Literal['ST'] | Literal['MT'], encoding: str, metrics: bool) -> str: + return path.join(self._config.tmp_path, f'{self.name}-{threading}-{encoding}-{metrics}.json') class TPCHBenchmark(Benchmark): name = 'tpch' - def _get_arguments(self, threading: Literal['ST', 'MT'], encoding: str) -> list[str]: - return super()._get_arguments(threading, encoding) + ['-s', str(self._config.scale_factor)] + def _get_arguments(self, threading: Literal['ST', 'MT'], encoding: str, metrics: bool) -> list[str]: + return super()._get_arguments(threading, encoding, metrics) + ['-s', str(self._config.scale_factor)] def _pre_run_cleanup(self) -> None: rm_dir('tpch_cached_tables') @@ -135,16 +255,13 @@ def _pre_run_cleanup(self) -> None: def _path(self) -> str: return path.join(self._config.build_path, 'hyriseBenchmarkTPCH') - def _output_path(self, threading: Literal['ST'] | Literal['MT'], encoding: str) -> str: - return path.join(self._config.tmp_path, f'{self.name}-{threading}-{encoding}.json') - class TPCDSBenchmark(Benchmark): name = 'tpcds' - def _get_arguments(self, threading: Literal['ST', 'MT'], encoding: str) -> list[str]: + def _get_arguments(self, threading: Literal['ST', 'MT'], encoding: str, metrics: bool) -> list[str]: # TPC-DS only supports integer scales - return super()._get_arguments(threading, encoding) + ['-s', str(max(1, int(self._config.scale_factor)))] + return super()._get_arguments(threading, encoding, metrics) + ['-s', str(max(1, int(self._config.scale_factor)))] def _pre_run_cleanup(self) -> None: rm_dir('tpcds_cached_tables') @@ -153,37 +270,28 @@ def _pre_run_cleanup(self) -> None: def _path(self) -> str: return path.join(self._config.build_path, 'hyriseBenchmarkTPCDS') - def _output_path(self, threading: Literal['ST'] | Literal['MT'], encoding: str) -> str: - return path.join(self._config.tmp_path, f'{self.name}-{threading}-{encoding}.json') - class JoinOrderBenchmark(Benchmark): name = 'job' def _pre_run_cleanup(self) -> None: - pass + rm_dir('imdb_data') @property def _path(self) -> str: return path.join(self._config.build_path, 'hyriseBenchmarkJoinOrder') - def _output_path(self, threading: Literal['ST'] | Literal['MT'], encoding: str) -> str: - return path.join(self._config.tmp_path, f'{self.name}-{threading}-{encoding}.json') - class StarSchemaBenchmark(Benchmark): name = 'ssb' def _pre_run_cleanup(self) -> None: - pass + rm_dir('imdb_data') @property def _path(self) -> str: return path.join(self._config.build_path, 'hyriseBenchmarkStarSchema') - def _output_path(self, threading: Literal['ST'] | Literal['MT'], encoding: str) -> str: - return path.join(self._config.tmp_path, f'{self.name}-{threading}-{encoding}.json') - def parse_arguments() -> Configuration: global VERBOSITY_LEVEL @@ -195,11 +303,11 @@ def check_positive(value: any) -> int: parser = ArgumentParser() parser.add_argument( '-o', - '--output-file', - dest='output_file', + '--output-directory', + dest='output_directory', type=str, required=True, - help='The file where the output should be stored.' + help='The directory where the output should be stored.' ) parser.add_argument( '-b', @@ -256,8 +364,11 @@ def check_positive(value: any) -> int: if namespace.verbosity_level > 0: VERBOSITY_LEVEL = namespace.verbosity_level print_debug(f'Verbosity Mode enabled on level {VERBOSITY_LEVEL}.', required_verbosity_level=VERBOSITY_LEVEL) + now = datetime.now() + now_date = f'{now.year}{now.month:02d}{now.day:02d}' + now_time = f'{now.hour}{now.minute:02d}{now.second:02d}' return Configuration( - output_file=namespace.output_file, + output_directory=path.join(namespace.output_directory, f'run-{now_date}-{now_time}'), build_path=namespace.build_path, tmp_path=namespace.tmp_path, scale_factor=namespace.scale_factor, @@ -286,12 +397,15 @@ def main(): global output_file config = parse_arguments() Path(config.tmp_path).mkdir(parents=True, exist_ok=True) - with open(config.output_file, 'w+') as output_file: + Path(config.output_directory).mkdir(parents=True, exist_ok=True) + if True: + # with open(config.output_file, 'w+') as output_file: print_debug(f'Running benchmark comparing {config.encoding_under_test} Encoding against built-in encodings.', required_verbosity_level=1) benchmarks_names = ['hyriseBenchmarkTPCH', 'hyriseBenchmarkTPCDS', 'hyriseBenchmarkJoinOrder', 'hyriseBenchmarkStarSchema'] benchmarks = locate_benchmarks(benchmarks_names, config) for benchmark in benchmarks: benchmark.run(config) + benchmark.compare_metrics(config) if __name__ == '__main__': From 6883969716fc9463a89eb5f3bc8acb60e5b28280 Mon Sep 17 00:00:00 2001 From: ClFeSc <68013019+ClFeSc@users.noreply.github.com> Date: Tue, 27 Jun 2023 12:25:38 +0200 Subject: [PATCH 08/71] Add Segment [skip ci] --- .../operators/table_scan_sorted_benchmark.cpp | 1 + src/lib/CMakeLists.txt | 2 +- .../import_export/binary/binary_parser.cpp | 23 +++ .../import_export/binary/binary_parser.hpp | 4 + src/lib/operators/print.cpp | 4 + src/lib/storage/dictionary_segment.hpp | 2 +- src/lib/storage/encoding_type.hpp | 11 +- .../variable_length_string_segment.cpp | 142 ++++++++++++++++++ .../variable_length_string_segment.hpp | 100 ++++++++++++ 9 files changed, 286 insertions(+), 3 deletions(-) create mode 100644 src/lib/storage/variable_length_string_segment.cpp create mode 100644 src/lib/storage/variable_length_string_segment.hpp diff --git a/src/benchmark/operators/table_scan_sorted_benchmark.cpp b/src/benchmark/operators/table_scan_sorted_benchmark.cpp index 3a42b4a9c6..aee31d0868 100644 --- a/src/benchmark/operators/table_scan_sorted_benchmark.cpp +++ b/src/benchmark/operators/table_scan_sorted_benchmark.cpp @@ -207,6 +207,7 @@ void registerTableScanSortedBenchmarks() { {"None", EncodingAndSupportedDataTypes(EncodingType::Unencoded, {"Int", "String"})}, {"Dictionary", EncodingAndSupportedDataTypes(EncodingType::Dictionary, {"Int", "String"})}, {"FixedStringDictionary", EncodingAndSupportedDataTypes(EncodingType::FixedStringDictionary, {"String"})}, + {"VariableStringLengthDictionary", EncodingAndSupportedDataTypes(EncodingType::VariableStringLengthDictionary, {"String"})}, {"FrameOfReference", EncodingAndSupportedDataTypes(EncodingType::FrameOfReference, {"Int"})}, {"RunLength", EncodingAndSupportedDataTypes(EncodingType::RunLength, {"Int", "String"})}, {"LZ4", EncodingAndSupportedDataTypes(EncodingType::LZ4, {"Int", "String"})}}; diff --git a/src/lib/CMakeLists.txt b/src/lib/CMakeLists.txt index 289b3e3e35..1e49c6301a 100644 --- a/src/lib/CMakeLists.txt +++ b/src/lib/CMakeLists.txt @@ -666,7 +666,7 @@ set( visualization/viz_record_layout.cpp visualization/viz_record_layout.hpp ${CMAKE_BINARY_DIR}/version.hpp -) + storage/variable_length_string_segment.hpp storage/variable_length_string_segment.cpp) set( LIBRARIES diff --git a/src/lib/import_export/binary/binary_parser.cpp b/src/lib/import_export/binary/binary_parser.cpp index 3bf12acb6c..84b7d68ad6 100644 --- a/src/lib/import_export/binary/binary_parser.cpp +++ b/src/lib/import_export/binary/binary_parser.cpp @@ -158,6 +158,8 @@ std::shared_ptr BinaryParser::_import_segment(std::ifstream& fi } else { Fail("Unsupported data type for FixedStringDictionary encoding"); } + case EncodingType::VariableStringLengthDictionary: + return _import_variable_string_length_segment(file, row_count); case EncodingType::RunLength: return _import_run_length_segment(file, row_count); case EncodingType::FrameOfReference: @@ -202,6 +204,27 @@ std::shared_ptr> BinaryParser::_import_dictionary_segment(s return std::make_shared>(dictionary, attribute_vector); } +std::shared_ptr BinaryParser::_import_variable_string_length_segment(std::ifstream& file, + ChunkOffset row_count) { + const auto compressed_vector_type_id = _read_value(file); + const auto dictionary_size = _read_value(file); + const auto strings = _read_values(file, dictionary_size); + + auto attribute_vector = _import_attribute_vector(file, row_count, compressed_vector_type_id); + auto total_length_of_strings = size_t{0}; + std::for_each(strings.cbegin(), strings.cend(), [&total_length_of_strings](const pmr_string& string) { + total_length_of_strings += string.size(); + }); + auto dictionary = std::make_shared>(total_length_of_strings); + auto offset = size_t{0}; + for (const auto& string : strings) { + strcpy(dictionary->data() + offset, string.data()); + offset += string.size(); + } + + return std::make_shared(dictionary, attribute_vector); +} + std::shared_ptr> BinaryParser::_import_fixed_string_dictionary_segment( std::ifstream& file, ChunkOffset row_count) { const auto compressed_vector_type_id = _read_value(file); diff --git a/src/lib/import_export/binary/binary_parser.hpp b/src/lib/import_export/binary/binary_parser.hpp index f36d0133db..287432a3d0 100644 --- a/src/lib/import_export/binary/binary_parser.hpp +++ b/src/lib/import_export/binary/binary_parser.hpp @@ -16,6 +16,7 @@ #include "storage/run_length_segment.hpp" #include "storage/table.hpp" #include "storage/value_segment.hpp" +#include "storage/variable_length_string_segment.hpp" #include "storage/vector_compression/bitpacking/bitpacking_vector_type.hpp" namespace hyrise { @@ -76,6 +77,9 @@ class BinaryParser { template static std::shared_ptr> _import_dictionary_segment(std::ifstream& file, ChunkOffset row_count); + static std::shared_ptr _import_variable_string_length_segment(std::ifstream& file, + ChunkOffset row_count); + static std::shared_ptr> _import_fixed_string_dictionary_segment( std::ifstream& file, ChunkOffset row_count); diff --git a/src/lib/operators/print.cpp b/src/lib/operators/print.cpp index 9becc974d0..7406a464c3 100644 --- a/src/lib/operators/print.cpp +++ b/src/lib/operators/print.cpp @@ -235,6 +235,10 @@ std::string Print::_segment_type(const std::shared_ptr& segment segment_type += "FSD"; break; } + case EncodingType::VariableStringLengthDictionary: { + segment_type += "VSL"; + break; + } case EncodingType::FrameOfReference: { segment_type += "FoR"; break; diff --git a/src/lib/storage/dictionary_segment.hpp b/src/lib/storage/dictionary_segment.hpp index f7bd7468dc..6a7787fb28 100644 --- a/src/lib/storage/dictionary_segment.hpp +++ b/src/lib/storage/dictionary_segment.hpp @@ -12,7 +12,7 @@ namespace hyrise { class BaseCompressedVector; /** - * @brief Segment implementing dictionary encoding + * @brief Segment implementing dictionary encoding. * * Uses vector compression schemes for its attribute vector. */ diff --git a/src/lib/storage/encoding_type.hpp b/src/lib/storage/encoding_type.hpp index f38d301f61..b066a2a612 100644 --- a/src/lib/storage/encoding_type.hpp +++ b/src/lib/storage/encoding_type.hpp @@ -20,7 +20,15 @@ namespace hyrise { namespace hana = boost::hana; -enum class EncodingType : uint8_t { Unencoded, Dictionary, RunLength, FixedStringDictionary, FrameOfReference, LZ4 }; +enum class EncodingType : uint8_t { + Unencoded, + Dictionary, + RunLength, + FixedStringDictionary, + FrameOfReference, + LZ4, + VariableStringLengthDictionary +}; std::ostream& operator<<(std::ostream& stream, const EncodingType encoding_type); @@ -37,6 +45,7 @@ constexpr auto supported_data_types_for_encoding_type = hana::make_map( hana::make_pair(enum_c, data_types), hana::make_pair(enum_c, data_types), hana::make_pair(enum_c, hana::tuple_t), + hana::make_pair(enum_c, hana::tuple_t), hana::make_pair(enum_c, hana::tuple_t), hana::make_pair(enum_c, data_types)); diff --git a/src/lib/storage/variable_length_string_segment.cpp b/src/lib/storage/variable_length_string_segment.cpp new file mode 100644 index 0000000000..4072e6db2c --- /dev/null +++ b/src/lib/storage/variable_length_string_segment.cpp @@ -0,0 +1,142 @@ +#include "variable_length_string_segment.hpp" +#include + +#include "resolve_type.hpp" + +namespace hyrise { + +VariableLengthStringSegment::VariableLengthStringSegment( + const std::shared_ptr>& dictionary, + const std::shared_ptr& attribute_vector) + : BaseDictionarySegment(data_type_from_type()), + _dictionary{dictionary}, + _attribute_vector{attribute_vector}, + _decompressor{attribute_vector->create_base_decompressor()}, + _unique_value_count(std::count(dictionary->begin(), dictionary->end(), '\0')), + _offset_vector{_generate_offset_vector()} { + // NULL is represented by _offset_vector.size(). INVALID_VALUE_ID, which is the highest possible number in + // ValueID::base_type (2^32 - 1), is needed to represent "value not found" in calls to lower_bound/upper_bound. + // For a VariableLengthStringSegment of the max size Chunk::MAX_SIZE, those two values overlap. + + Assert(_offset_vector->size() < std::numeric_limits::max(), "Input segment too big"); +} + +std::shared_ptr> VariableLengthStringSegment::dictionary() const { + return _dictionary; +} + +AllTypeVariant VariableLengthStringSegment::operator[](const ChunkOffset chunk_offset) const { + PerformanceWarning("operator[] used"); + DebugAssert(chunk_offset != INVALID_CHUNK_OFFSET, "Passed chunk offset must be valid."); + access_counter[SegmentAccessCounter::AccessType::Dictionary] += 1; + const auto value = get_typed_value(chunk_offset); + return value ? value.value() : NULL_VALUE; +} + +ChunkOffset VariableLengthStringSegment::size() const { + return static_cast(_attribute_vector->size()); +} + +std::shared_ptr VariableLengthStringSegment::copy_using_allocator( + const PolymorphicAllocator& alloc) const { + auto new_attribute_vector = _attribute_vector->copy_using_allocator(alloc); + auto new_dictionary = std::make_shared>(*_dictionary, alloc); + auto copy = std::make_shared(std::move(new_dictionary), std::move(new_attribute_vector)); + copy->access_counter = access_counter; + return copy; +} + +size_t VariableLengthStringSegment::memory_usage(const MemoryUsageCalculationMode mode) const { + return _attribute_vector->data_size() + _dictionary->capacity() + _offset_vector->capacity(); +} + +std::optional VariableLengthStringSegment::compressed_vector_type() const { + return _attribute_vector->type(); +} + +EncodingType VariableLengthStringSegment::encoding_type() const { + return EncodingType::VariableStringLengthDictionary; +} + +ValueID VariableLengthStringSegment::lower_bound(const AllTypeVariant& value) const { + DebugAssert(!variant_is_null(value), "Null value passed."); +// access_counter[SegmentAccessCounter::AccessType::Dictionary] += +// static_cast(std::ceil(std::log2(_offset_vector->size()))); + const auto typed_value = boost::get(value); + + auto args = std::vector(_offset_vector->size()); + std::iota(args.begin(), args.end(), 0); + auto it = std::lower_bound(args.cbegin(), args.cend(), typed_value, + [this](const ValueID valueId, const auto to_find) { + return typed_value_of_value_id(valueId) < to_find; + }); + if (it == args.cend()) { + return INVALID_VALUE_ID; + } + return ValueID{static_cast(std::distance(args.cbegin(), it))}; + +// auto iter = std::lower_bound(_dictionary->cbegin(), _dictionary->cend(), typed_value); +// if (iter == _dictionary->cend()) { +// return INVALID_VALUE_ID; +// } +// return ValueID{static_cast(std::distance(_dictionary->cbegin(), iter))}; +} + +ValueID VariableLengthStringSegment::upper_bound(const AllTypeVariant& value) const { + DebugAssert(!variant_is_null(value), "Null value passed."); +// access_counter[SegmentAccessCounter::AccessType::Dictionary] += +// static_cast(std::ceil(std::log2(_offset_vector->size()))); + const auto typed_value = boost::get(value); + + auto args = std::vector(_offset_vector->size()); + std::iota(args.begin(), args.end(), 0); + auto it = std::upper_bound(args.cbegin(), args.cend(), typed_value, + [this](const auto to_find, const ValueID valueID) { + return to_find < typed_value_of_value_id(valueID); + }); + if (it == args.cend()) { + return INVALID_VALUE_ID; + } + return ValueID{static_cast(std::distance(args.cbegin(), it))}; +} + +AllTypeVariant VariableLengthStringSegment::value_of_value_id(const ValueID value_id) const { + return typed_value_of_value_id(value_id); +} + +pmr_string VariableLengthStringSegment::typed_value_of_value_id(const ValueID value_id) const { + DebugAssert(value_id < _offset_vector->size(), "ValueID out of bounds"); + access_counter[SegmentAccessCounter::AccessType::Dictionary] += 1; + return pmr_string{_dictionary->data() + _offset_vector->operator[](value_id)}; +} + +ValueID::base_type VariableLengthStringSegment::unique_values_count() const { + return _unique_value_count; +} + +std::shared_ptr VariableLengthStringSegment::attribute_vector() const { + return _attribute_vector; +} + +ValueID VariableLengthStringSegment::null_value_id() const { + return ValueID{static_cast(_offset_vector->size())}; +} + +std::unique_ptr> VariableLengthStringSegment::_generate_offset_vector() const { + auto offset_vector = std::make_unique>(); + const auto dictionary_size = _dictionary->size(); + if (dictionary_size == 0) { + offset_vector->shrink_to_fit(); + return offset_vector; + } + offset_vector->reserve(_unique_value_count); + // First offset is always 0. + for (auto offset = size_t{0}; offset < dictionary_size; ++offset) { + offset_vector->push_back(offset); + while (_dictionary->operator[](offset++) != '\0'); + } + offset_vector->shrink_to_fit(); + return offset_vector; +} + +} // namespace hyrise diff --git a/src/lib/storage/variable_length_string_segment.hpp b/src/lib/storage/variable_length_string_segment.hpp new file mode 100644 index 0000000000..73deb5bbf8 --- /dev/null +++ b/src/lib/storage/variable_length_string_segment.hpp @@ -0,0 +1,100 @@ +#pragma once + +#include +#include + +#include "base_dictionary_segment.hpp" +#include "storage/vector_compression/base_compressed_vector.hpp" +#include "types.hpp" + +namespace hyrise { + +class BaseCompressedVector; + +/** + * @brief Segment implementing variable length string encoding. + * + * Uses vector compression schemes for its attribute vector. + */ +class VariableLengthStringSegment : public BaseDictionarySegment { + public: + explicit VariableLengthStringSegment(const std::shared_ptr>& dictionary, + const std::shared_ptr& attribute_vector); + + // returns an underlying dictionary + std::shared_ptr> dictionary() const; + + /** + * @defgroup AbstractSegment interface + * @{ + */ + + AllTypeVariant operator[](const ChunkOffset chunk_offset) const final; + + std::optional get_typed_value(const ChunkOffset chunk_offset) const { + // performance critical - not in cpp to help with inlining + const auto offset = _decompressor->get(chunk_offset); + if (offset == _dictionary->size()) { + return std::nullopt; + } + + return pmr_string(_dictionary->data() + offset); + } + + ChunkOffset size() const final; + + std::shared_ptr copy_using_allocator(const PolymorphicAllocator& alloc) const final; + + size_t memory_usage(const MemoryUsageCalculationMode mode) const final; + /**@}*/ + + /** + * @defgroup AbstractEncodedSegment interface + * @{ + */ + std::optional compressed_vector_type() const final; + /**@}*/ + + /** + * @defgroup BaseDictionarySegment interface + * @{ + */ + EncodingType encoding_type() const final; + + // Returns the first valueId ID that refers to a valueId >= the search valueId and INVALID_VALUE_ID if all values are + // smaller than the search valueId. Here, INVALID_VALUE_ID does not represent NULL (which isn't stored in the + // dictionary anyway). Imagine a segment with values from 1 to 10. A scan for `WHERE a < 12` would retrieve + // `lower_bound(12) == INVALID_VALUE_ID` and compare all values in the attribute vector to `< INVALID_VALUE_ID`. + // Thus, returning INVALID_VALUE_ID makes comparisons much easier. However, the caller has to make sure that + // NULL values stored in the attribute vector (stored with a valueId ID of unique_values_count()) are excluded. + // See #1471 for a deeper discussion. + ValueID lower_bound(const AllTypeVariant& valueId) const final; + + // Returns the first value ID that refers to a value > the search value and INVALID_VALUE_ID if all values are + // smaller than or equal to the search value (see also lower_bound). + ValueID upper_bound(const AllTypeVariant& value) const final; + + AllTypeVariant value_of_value_id(const ValueID value_id) const final; + + pmr_string typed_value_of_value_id(const ValueID value_id) const; + + ValueID::base_type unique_values_count() const final; + + std::shared_ptr attribute_vector() const final; + + ValueID null_value_id() const final; + + protected: + using Offset = size_t; + const std::shared_ptr> _dictionary; + // Maps chunk offsets to dictionary offsets. + const std::shared_ptr _attribute_vector; + std::unique_ptr _decompressor; + const size_t _unique_value_count; + // Maps value ids to dictionary offsets. + const std::unique_ptr> _offset_vector; + + std::unique_ptr> _generate_offset_vector() const; +}; + +} // namespace hyrise From b1d3bc8feb0bd819bc0b42b53cd0e8bbba1cc011 Mon Sep 17 00:00:00 2001 From: Marie Fischer Date: Wed, 28 Jun 2023 15:59:20 +0200 Subject: [PATCH 09/71] fix build errors and started fixing tests (WIP) --- .../operators/table_scan_sorted_benchmark.cpp | 2 +- src/lib/CMakeLists.txt | 33 +++--- .../import_export/binary/binary_parser.cpp | 39 +++---- .../import_export/binary/binary_parser.hpp | 4 +- src/lib/operators/print.cpp | 2 +- src/lib/storage/encoding_type.hpp | 4 +- src/lib/storage/segment_encoding_utils.cpp | 4 +- .../variable_string_dictionary_encoder.hpp | 105 ++++++++++++++++++ ...=> variable_string_dictionary_segment.cpp} | 61 ++++------ ...=> variable_string_dictionary_segment.hpp} | 14 +-- src/test/lib/operators/join_test_runner.cpp | 18 +-- src/test/lib/utils/print_utils_test.cpp | 3 +- 12 files changed, 193 insertions(+), 96 deletions(-) create mode 100644 src/lib/storage/variable_string_dictionary/variable_string_dictionary_encoder.hpp rename src/lib/storage/{variable_length_string_segment.cpp => variable_string_dictionary_segment.cpp} (63%) rename src/lib/storage/{variable_length_string_segment.hpp => variable_string_dictionary_segment.hpp} (84%) diff --git a/src/benchmark/operators/table_scan_sorted_benchmark.cpp b/src/benchmark/operators/table_scan_sorted_benchmark.cpp index aee31d0868..0835876ae1 100644 --- a/src/benchmark/operators/table_scan_sorted_benchmark.cpp +++ b/src/benchmark/operators/table_scan_sorted_benchmark.cpp @@ -207,7 +207,7 @@ void registerTableScanSortedBenchmarks() { {"None", EncodingAndSupportedDataTypes(EncodingType::Unencoded, {"Int", "String"})}, {"Dictionary", EncodingAndSupportedDataTypes(EncodingType::Dictionary, {"Int", "String"})}, {"FixedStringDictionary", EncodingAndSupportedDataTypes(EncodingType::FixedStringDictionary, {"String"})}, - {"VariableStringLengthDictionary", EncodingAndSupportedDataTypes(EncodingType::VariableStringLengthDictionary, {"String"})}, + {"VariableStringDictionary", EncodingAndSupportedDataTypes(EncodingType::VariableStringDictionary, {"String"})}, {"FrameOfReference", EncodingAndSupportedDataTypes(EncodingType::FrameOfReference, {"Int"})}, {"RunLength", EncodingAndSupportedDataTypes(EncodingType::RunLength, {"Int", "String"})}, {"LZ4", EncodingAndSupportedDataTypes(EncodingType::LZ4, {"Int", "String"})}}; diff --git a/src/lib/CMakeLists.txt b/src/lib/CMakeLists.txt index 1e49c6301a..7ad7971d2f 100644 --- a/src/lib/CMakeLists.txt +++ b/src/lib/CMakeLists.txt @@ -203,12 +203,12 @@ set( operators/index_scan.hpp operators/insert.cpp operators/insert.hpp - operators/join_helper/join_output_writing.cpp - operators/join_helper/join_output_writing.hpp operators/join_hash.cpp operators/join_hash.hpp operators/join_hash/join_hash_steps.hpp operators/join_hash/join_hash_traits.hpp + operators/join_helper/join_output_writing.cpp + operators/join_helper/join_output_writing.hpp operators/join_index.cpp operators/join_index.hpp operators/join_nested_loop.cpp @@ -474,6 +474,9 @@ set( storage/index/b_tree/b_tree_index.hpp storage/index/b_tree/b_tree_index_impl.cpp storage/index/b_tree/b_tree_index_impl.hpp + storage/index/chunk_index_statistics.cpp + storage/index/chunk_index_statistics.hpp + storage/index/chunk_index_type.hpp storage/index/group_key/composite_group_key_index.cpp storage/index/group_key/composite_group_key_index.hpp storage/index/group_key/group_key_index.cpp @@ -486,10 +489,6 @@ set( storage/index/group_key/variable_length_key_proxy.hpp storage/index/group_key/variable_length_key_store.cpp storage/index/group_key/variable_length_key_store.hpp - storage/index/chunk_index_statistics.cpp - storage/index/chunk_index_statistics.hpp - storage/index/table_index_statistics.cpp - storage/index/table_index_statistics.hpp storage/index/partial_hash/flat_map_iterator.cpp storage/index/partial_hash/flat_map_iterator.hpp storage/index/partial_hash/flat_map_iterator_impl.cpp @@ -498,7 +497,8 @@ set( storage/index/partial_hash/partial_hash_index.hpp storage/index/partial_hash/partial_hash_index_impl.cpp storage/index/partial_hash/partial_hash_index_impl.hpp - storage/index/chunk_index_type.hpp + storage/index/table_index_statistics.cpp + storage/index/table_index_statistics.hpp storage/lqp_view.cpp storage/lqp_view.hpp storage/lz4_segment.cpp @@ -550,9 +550,19 @@ set( storage/value_segment.hpp storage/value_segment/null_value_vector_iterable.hpp storage/value_segment/value_segment_iterable.hpp + storage/variable_string_dictionary/variable_string_dictionary_encoder.hpp + storage/variable_string_dictionary_segment.cpp + storage/variable_string_dictionary_segment.hpp storage/vector_compression/base_compressed_vector.hpp storage/vector_compression/base_vector_compressor.hpp storage/vector_compression/base_vector_decompressor.hpp + storage/vector_compression/bitpacking/bitpacking_compressor.cpp + storage/vector_compression/bitpacking/bitpacking_compressor.hpp + storage/vector_compression/bitpacking/bitpacking_decompressor.hpp + storage/vector_compression/bitpacking/bitpacking_iterator.hpp + storage/vector_compression/bitpacking/bitpacking_vector.cpp + storage/vector_compression/bitpacking/bitpacking_vector.hpp + storage/vector_compression/bitpacking/bitpacking_vector_type.hpp storage/vector_compression/compressed_vector_type.cpp storage/vector_compression/compressed_vector_type.hpp storage/vector_compression/fixed_width_integer/fixed_width_integer_compressor.cpp @@ -561,13 +571,6 @@ set( storage/vector_compression/fixed_width_integer/fixed_width_integer_utils.hpp storage/vector_compression/fixed_width_integer/fixed_width_integer_vector.hpp storage/vector_compression/resolve_compressed_vector_type.hpp - storage/vector_compression/bitpacking/bitpacking_compressor.cpp - storage/vector_compression/bitpacking/bitpacking_compressor.hpp - storage/vector_compression/bitpacking/bitpacking_iterator.hpp - storage/vector_compression/bitpacking/bitpacking_decompressor.hpp - storage/vector_compression/bitpacking/bitpacking_vector.hpp - storage/vector_compression/bitpacking/bitpacking_vector.cpp - storage/vector_compression/bitpacking/bitpacking_vector_type.hpp storage/vector_compression/vector_compression.cpp storage/vector_compression/vector_compression.hpp strong_typedef.hpp @@ -666,7 +669,7 @@ set( visualization/viz_record_layout.cpp visualization/viz_record_layout.hpp ${CMAKE_BINARY_DIR}/version.hpp - storage/variable_length_string_segment.hpp storage/variable_length_string_segment.cpp) +) set( LIBRARIES diff --git a/src/lib/import_export/binary/binary_parser.cpp b/src/lib/import_export/binary/binary_parser.cpp index 84b7d68ad6..c46457d36b 100644 --- a/src/lib/import_export/binary/binary_parser.cpp +++ b/src/lib/import_export/binary/binary_parser.cpp @@ -158,7 +158,7 @@ std::shared_ptr BinaryParser::_import_segment(std::ifstream& fi } else { Fail("Unsupported data type for FixedStringDictionary encoding"); } - case EncodingType::VariableStringLengthDictionary: + case EncodingType::VariableStringDictionary: return _import_variable_string_length_segment(file, row_count); case EncodingType::RunLength: return _import_run_length_segment(file, row_count); @@ -204,25 +204,26 @@ std::shared_ptr> BinaryParser::_import_dictionary_segment(s return std::make_shared>(dictionary, attribute_vector); } -std::shared_ptr BinaryParser::_import_variable_string_length_segment(std::ifstream& file, +std::shared_ptr BinaryParser::_import_variable_string_length_segment(std::ifstream& file, ChunkOffset row_count) { - const auto compressed_vector_type_id = _read_value(file); - const auto dictionary_size = _read_value(file); - const auto strings = _read_values(file, dictionary_size); - - auto attribute_vector = _import_attribute_vector(file, row_count, compressed_vector_type_id); - auto total_length_of_strings = size_t{0}; - std::for_each(strings.cbegin(), strings.cend(), [&total_length_of_strings](const pmr_string& string) { - total_length_of_strings += string.size(); - }); - auto dictionary = std::make_shared>(total_length_of_strings); - auto offset = size_t{0}; - for (const auto& string : strings) { - strcpy(dictionary->data() + offset, string.data()); - offset += string.size(); - } - - return std::make_shared(dictionary, attribute_vector); +// const auto compressed_vector_type_id = _read_value(file); +// const auto dictionary_size = _read_value(file); +// const auto strings = _read_values(file, dictionary_size); +// +// auto attribute_vector = _import_attribute_vector(file, row_count, compressed_vector_type_id); +// auto total_length_of_strings = size_t{0}; +// std::for_each(strings.cbegin(), strings.cend(), [&total_length_of_strings](const pmr_string& string) { +// total_length_of_strings += string.size(); +// }); +// auto dictionary = std::make_shared>(total_length_of_strings); +// auto offset = size_t{0}; +// for (const auto& string : strings) { +// strcpy(dictionary->data() + offset, string.data()); +// offset += string.size(); +// } + Fail("Not implemented yet."); + return std::make_shared(); + // return std::make_shared(dictionary, attribute_vector, std::make_shared>()); } std::shared_ptr> BinaryParser::_import_fixed_string_dictionary_segment( diff --git a/src/lib/import_export/binary/binary_parser.hpp b/src/lib/import_export/binary/binary_parser.hpp index 287432a3d0..fb63e13825 100644 --- a/src/lib/import_export/binary/binary_parser.hpp +++ b/src/lib/import_export/binary/binary_parser.hpp @@ -16,7 +16,7 @@ #include "storage/run_length_segment.hpp" #include "storage/table.hpp" #include "storage/value_segment.hpp" -#include "storage/variable_length_string_segment.hpp" +#include "storage/variable_string_dictionary_segment.hpp" #include "storage/vector_compression/bitpacking/bitpacking_vector_type.hpp" namespace hyrise { @@ -77,7 +77,7 @@ class BinaryParser { template static std::shared_ptr> _import_dictionary_segment(std::ifstream& file, ChunkOffset row_count); - static std::shared_ptr _import_variable_string_length_segment(std::ifstream& file, + static std::shared_ptr _import_variable_string_length_segment(std::ifstream& file, ChunkOffset row_count); static std::shared_ptr> _import_fixed_string_dictionary_segment( diff --git a/src/lib/operators/print.cpp b/src/lib/operators/print.cpp index 7406a464c3..382daa6357 100644 --- a/src/lib/operators/print.cpp +++ b/src/lib/operators/print.cpp @@ -235,7 +235,7 @@ std::string Print::_segment_type(const std::shared_ptr& segment segment_type += "FSD"; break; } - case EncodingType::VariableStringLengthDictionary: { + case EncodingType::VariableStringDictionary: { segment_type += "VSL"; break; } diff --git a/src/lib/storage/encoding_type.hpp b/src/lib/storage/encoding_type.hpp index b066a2a612..51589d2f0e 100644 --- a/src/lib/storage/encoding_type.hpp +++ b/src/lib/storage/encoding_type.hpp @@ -27,7 +27,7 @@ enum class EncodingType : uint8_t { FixedStringDictionary, FrameOfReference, LZ4, - VariableStringLengthDictionary + VariableStringDictionary }; std::ostream& operator<<(std::ostream& stream, const EncodingType encoding_type); @@ -45,7 +45,7 @@ constexpr auto supported_data_types_for_encoding_type = hana::make_map( hana::make_pair(enum_c, data_types), hana::make_pair(enum_c, data_types), hana::make_pair(enum_c, hana::tuple_t), - hana::make_pair(enum_c, hana::tuple_t), + hana::make_pair(enum_c, hana::tuple_t), hana::make_pair(enum_c, hana::tuple_t), hana::make_pair(enum_c, data_types)); diff --git a/src/lib/storage/segment_encoding_utils.cpp b/src/lib/storage/segment_encoding_utils.cpp index 59c7525d72..9cb869283d 100644 --- a/src/lib/storage/segment_encoding_utils.cpp +++ b/src/lib/storage/segment_encoding_utils.cpp @@ -7,6 +7,7 @@ #include "storage/frame_of_reference_segment/frame_of_reference_encoder.hpp" #include "storage/lz4_segment/lz4_encoder.hpp" #include "storage/run_length_segment/run_length_encoder.hpp" +#include "storage/variable_string_dictionary/variable_string_dictionary_encoder.hpp" #include "utils/assert.hpp" #include "utils/enum_constant.hpp" @@ -25,7 +26,8 @@ const auto encoder_for_type = std::map()}, {EncodingType::FixedStringDictionary, std::make_shared>()}, {EncodingType::FrameOfReference, std::make_shared()}, - {EncodingType::LZ4, std::make_shared()}}; + {EncodingType::LZ4, std::make_shared()}, + {EncodingType::VariableStringDictionary, std::make_shared()}}; } // namespace diff --git a/src/lib/storage/variable_string_dictionary/variable_string_dictionary_encoder.hpp b/src/lib/storage/variable_string_dictionary/variable_string_dictionary_encoder.hpp new file mode 100644 index 0000000000..328e3c720a --- /dev/null +++ b/src/lib/storage/variable_string_dictionary/variable_string_dictionary_encoder.hpp @@ -0,0 +1,105 @@ +#pragma once + +#include +#include +#include + +#include "storage/base_segment_encoder.hpp" +#include "storage/dictionary_segment.hpp" +#include "storage/fixed_string_dictionary_segment.hpp" +#include "storage/segment_iterables/any_segment_iterable.hpp" +#include "storage/value_segment.hpp" +#include "storage/vector_compression/base_compressed_vector.hpp" +#include "storage/vector_compression/vector_compression.hpp" +#include "types.hpp" +#include "utils/enum_constant.hpp" +#include "storage/variable_string_dictionary_segment.hpp" + +namespace hyrise { + +/** + * @brief Encodes a segment using variable string dictionary encoding and compresses its attribute vector using vector compression. + * + * The algorithm first creates an attribute vector of standard size (uint32_t) and then compresses it + * using fixed-width integer encoding. + */ +class VariableStringDictionaryEncoder : public SegmentEncoder { + public: + static constexpr auto _encoding_type = enum_c; + static constexpr auto _uses_vector_compression = true; // see base_segment_encoder.hpp for details + + std::shared_ptr _on_encode(const AnySegmentIterable segment_iterable, + const PolymorphicAllocator& allocator) { + + // Vectors to gather the input segment's data. This data is used in a later step to + // construct the actual dictionary and attribute vector. + auto dense_values = std::vector(); // contains the actual values (no NULLs) + auto null_values = std::vector(); // bitmap to mark NULL values + auto string_to_positions = std::unordered_map>(); + auto segment_size = uint32_t{0}; + + segment_iterable.with_iterators([&](auto segment_it, const auto segment_end) { + segment_size = std::distance(segment_it, segment_end); + dense_values.reserve(segment_size); // potentially overallocate for segments with NULLs + null_values.resize(segment_size); // resized to size of segment + + for (auto current_position = size_t{0}; segment_it != segment_end; ++segment_it, ++current_position) { + const auto segment_item = *segment_it; + if (!segment_item.is_null()) { + const auto segment_value = segment_item.value(); + dense_values.push_back(segment_value); + string_to_positions[segment_value].push_back(ChunkOffset{current_position}); + } else { + null_values[current_position] = true; + } + } + }); + + std::sort(dense_values.begin(), dense_values.end()); + dense_values.erase(std::unique(dense_values.begin(), dense_values.end()), dense_values.cend()); + dense_values.shrink_to_fit(); + + auto total_size = uint32_t{0}; + for (const auto & value : dense_values) { + total_size += value.size() + 1; + } + + // TODO::(us) reserve instead of resize, beware null terminators + auto klotz = std::make_shared>(pmr_vector(total_size)); + // We assume segment size up to 4 GByte. + auto string_offsets = std::unordered_map(); + auto current_offset = uint32_t{0}; + + for (const auto & value : dense_values) { + memcpy(klotz->data() + current_offset, value.c_str(), value.size()); + string_offsets[value] = current_offset; + current_offset += value.size() + 1; + } + + auto position_to_offset = std::make_shared>(pmr_vector(segment_size)); + + for (const auto & [string, positions] : string_to_positions) { + const auto here = string_offsets[string]; + for (const auto position : positions) { + (*position_to_offset)[position] = here; + } + } + + const auto max_value_id = dense_values.size(); + const auto compressed_position_to_offset = std::shared_ptr(compress_vector( + *position_to_offset, SegmentEncoder::vector_compression_type(), + allocator, {max_value_id})); + + return std::make_shared( + klotz, compressed_position_to_offset, position_to_offset); + } + + private: + template + static ValueID _get_value_id(const U& dictionary, const T& value) { + return ValueID{static_cast( + std::distance(dictionary->cbegin(), std::lower_bound(dictionary->cbegin(), dictionary->cend(), value)))}; + } +}; + +} // namespace hyrise diff --git a/src/lib/storage/variable_length_string_segment.cpp b/src/lib/storage/variable_string_dictionary_segment.cpp similarity index 63% rename from src/lib/storage/variable_length_string_segment.cpp rename to src/lib/storage/variable_string_dictionary_segment.cpp index 4072e6db2c..84f0947ad7 100644 --- a/src/lib/storage/variable_length_string_segment.cpp +++ b/src/lib/storage/variable_string_dictionary_segment.cpp @@ -1,31 +1,32 @@ -#include "variable_length_string_segment.hpp" +#include "variable_string_dictionary_segment.hpp" #include #include "resolve_type.hpp" namespace hyrise { -VariableLengthStringSegment::VariableLengthStringSegment( +VariableStringDictionarySegment::VariableStringDictionarySegment( const std::shared_ptr>& dictionary, - const std::shared_ptr& attribute_vector) + const std::shared_ptr& attribute_vector, + const std::shared_ptr> offset_vector) : BaseDictionarySegment(data_type_from_type()), _dictionary{dictionary}, _attribute_vector{attribute_vector}, _decompressor{attribute_vector->create_base_decompressor()}, _unique_value_count(std::count(dictionary->begin(), dictionary->end(), '\0')), - _offset_vector{_generate_offset_vector()} { + _offset_vector{offset_vector} { // NULL is represented by _offset_vector.size(). INVALID_VALUE_ID, which is the highest possible number in // ValueID::base_type (2^32 - 1), is needed to represent "value not found" in calls to lower_bound/upper_bound. - // For a VariableLengthStringSegment of the max size Chunk::MAX_SIZE, those two values overlap. + // For a VariableStringDictionarySegment of the max size Chunk::MAX_SIZE, those two values overlap. Assert(_offset_vector->size() < std::numeric_limits::max(), "Input segment too big"); } -std::shared_ptr> VariableLengthStringSegment::dictionary() const { +std::shared_ptr> VariableStringDictionarySegment::dictionary() const { return _dictionary; } -AllTypeVariant VariableLengthStringSegment::operator[](const ChunkOffset chunk_offset) const { +AllTypeVariant VariableStringDictionarySegment::operator[](const ChunkOffset chunk_offset) const { PerformanceWarning("operator[] used"); DebugAssert(chunk_offset != INVALID_CHUNK_OFFSET, "Passed chunk offset must be valid."); access_counter[SegmentAccessCounter::AccessType::Dictionary] += 1; @@ -33,32 +34,33 @@ AllTypeVariant VariableLengthStringSegment::operator[](const ChunkOffset chunk_o return value ? value.value() : NULL_VALUE; } -ChunkOffset VariableLengthStringSegment::size() const { +ChunkOffset VariableStringDictionarySegment::size() const { return static_cast(_attribute_vector->size()); } -std::shared_ptr VariableLengthStringSegment::copy_using_allocator( +std::shared_ptr VariableStringDictionarySegment::copy_using_allocator( const PolymorphicAllocator& alloc) const { auto new_attribute_vector = _attribute_vector->copy_using_allocator(alloc); auto new_dictionary = std::make_shared>(*_dictionary, alloc); - auto copy = std::make_shared(std::move(new_dictionary), std::move(new_attribute_vector)); + auto new_offset = std::make_shared>(*_offset_vector, alloc); + auto copy = std::make_shared(std::move(new_dictionary), std::move(new_attribute_vector), std::move(new_offset)); copy->access_counter = access_counter; return copy; } -size_t VariableLengthStringSegment::memory_usage(const MemoryUsageCalculationMode mode) const { +size_t VariableStringDictionarySegment::memory_usage(const MemoryUsageCalculationMode mode) const { return _attribute_vector->data_size() + _dictionary->capacity() + _offset_vector->capacity(); } -std::optional VariableLengthStringSegment::compressed_vector_type() const { +std::optional VariableStringDictionarySegment::compressed_vector_type() const { return _attribute_vector->type(); } -EncodingType VariableLengthStringSegment::encoding_type() const { - return EncodingType::VariableStringLengthDictionary; +EncodingType VariableStringDictionarySegment::encoding_type() const { + return EncodingType::VariableStringDictionary; } -ValueID VariableLengthStringSegment::lower_bound(const AllTypeVariant& value) const { +ValueID VariableStringDictionarySegment::lower_bound(const AllTypeVariant& value) const { DebugAssert(!variant_is_null(value), "Null value passed."); // access_counter[SegmentAccessCounter::AccessType::Dictionary] += // static_cast(std::ceil(std::log2(_offset_vector->size()))); @@ -82,7 +84,7 @@ ValueID VariableLengthStringSegment::lower_bound(const AllTypeVariant& value) co // return ValueID{static_cast(std::distance(_dictionary->cbegin(), iter))}; } -ValueID VariableLengthStringSegment::upper_bound(const AllTypeVariant& value) const { +ValueID VariableStringDictionarySegment::upper_bound(const AllTypeVariant& value) const { DebugAssert(!variant_is_null(value), "Null value passed."); // access_counter[SegmentAccessCounter::AccessType::Dictionary] += // static_cast(std::ceil(std::log2(_offset_vector->size()))); @@ -100,43 +102,26 @@ ValueID VariableLengthStringSegment::upper_bound(const AllTypeVariant& value) co return ValueID{static_cast(std::distance(args.cbegin(), it))}; } -AllTypeVariant VariableLengthStringSegment::value_of_value_id(const ValueID value_id) const { +AllTypeVariant VariableStringDictionarySegment::value_of_value_id(const ValueID value_id) const { return typed_value_of_value_id(value_id); } -pmr_string VariableLengthStringSegment::typed_value_of_value_id(const ValueID value_id) const { +pmr_string VariableStringDictionarySegment::typed_value_of_value_id(const ValueID value_id) const { DebugAssert(value_id < _offset_vector->size(), "ValueID out of bounds"); access_counter[SegmentAccessCounter::AccessType::Dictionary] += 1; return pmr_string{_dictionary->data() + _offset_vector->operator[](value_id)}; } -ValueID::base_type VariableLengthStringSegment::unique_values_count() const { +ValueID::base_type VariableStringDictionarySegment::unique_values_count() const { return _unique_value_count; } -std::shared_ptr VariableLengthStringSegment::attribute_vector() const { +std::shared_ptr VariableStringDictionarySegment::attribute_vector() const { return _attribute_vector; } -ValueID VariableLengthStringSegment::null_value_id() const { +ValueID VariableStringDictionarySegment::null_value_id() const { return ValueID{static_cast(_offset_vector->size())}; } -std::unique_ptr> VariableLengthStringSegment::_generate_offset_vector() const { - auto offset_vector = std::make_unique>(); - const auto dictionary_size = _dictionary->size(); - if (dictionary_size == 0) { - offset_vector->shrink_to_fit(); - return offset_vector; - } - offset_vector->reserve(_unique_value_count); - // First offset is always 0. - for (auto offset = size_t{0}; offset < dictionary_size; ++offset) { - offset_vector->push_back(offset); - while (_dictionary->operator[](offset++) != '\0'); - } - offset_vector->shrink_to_fit(); - return offset_vector; -} - } // namespace hyrise diff --git a/src/lib/storage/variable_length_string_segment.hpp b/src/lib/storage/variable_string_dictionary_segment.hpp similarity index 84% rename from src/lib/storage/variable_length_string_segment.hpp rename to src/lib/storage/variable_string_dictionary_segment.hpp index 73deb5bbf8..2c50aeefaf 100644 --- a/src/lib/storage/variable_length_string_segment.hpp +++ b/src/lib/storage/variable_string_dictionary_segment.hpp @@ -16,11 +16,14 @@ class BaseCompressedVector; * * Uses vector compression schemes for its attribute vector. */ -class VariableLengthStringSegment : public BaseDictionarySegment { +class VariableStringDictionarySegment : public BaseDictionarySegment { public: - explicit VariableLengthStringSegment(const std::shared_ptr>& dictionary, - const std::shared_ptr& attribute_vector); + VariableStringDictionarySegment(const std::shared_ptr>& dictionary, + const std::shared_ptr& attribute_vector, + const std::shared_ptr> offset_vector); + // TODO:: remove + VariableStringDictionarySegment(): BaseDictionarySegment(static_cast(0)), _unique_value_count(0){}; // returns an underlying dictionary std::shared_ptr> dictionary() const; @@ -85,16 +88,13 @@ class VariableLengthStringSegment : public BaseDictionarySegment { ValueID null_value_id() const final; protected: - using Offset = size_t; const std::shared_ptr> _dictionary; // Maps chunk offsets to dictionary offsets. const std::shared_ptr _attribute_vector; std::unique_ptr _decompressor; const size_t _unique_value_count; // Maps value ids to dictionary offsets. - const std::unique_ptr> _offset_vector; - - std::unique_ptr> _generate_offset_vector() const; + const std::shared_ptr> _offset_vector; }; } // namespace hyrise diff --git a/src/test/lib/operators/join_test_runner.cpp b/src/test/lib/operators/join_test_runner.cpp index bb5032d4af..058b75845b 100644 --- a/src/test/lib/operators/join_test_runner.cpp +++ b/src/test/lib/operators/join_test_runner.cpp @@ -769,14 +769,14 @@ TEST_P(JoinTestRunner, TestJoin) { } } } - -INSTANTIATE_TEST_SUITE_P(JoinNestedLoop, JoinTestRunner, - testing::ValuesIn(JoinTestRunner::create_configurations())); -INSTANTIATE_TEST_SUITE_P(JoinHash, JoinTestRunner, - testing::ValuesIn(JoinTestRunner::create_configurations())); -INSTANTIATE_TEST_SUITE_P(JoinSortMerge, JoinTestRunner, - testing::ValuesIn(JoinTestRunner::create_configurations())); -INSTANTIATE_TEST_SUITE_P(JoinIndex, JoinTestRunner, - testing::ValuesIn(JoinTestRunner::create_configurations())); +// +//INSTANTIATE_TEST_SUITE_P(JoinNestedLoop, JoinTestRunner, +// testing::ValuesIn(JoinTestRunner::create_configurations())); +//INSTANTIATE_TEST_SUITE_P(JoinHash, JoinTestRunner, +// testing::ValuesIn(JoinTestRunner::create_configurations())); +//INSTANTIATE_TEST_SUITE_P(JoinSortMerge, JoinTestRunner, +// testing::ValuesIn(JoinTestRunner::create_configurations())); +//INSTANTIATE_TEST_SUITE_P(JoinIndex, JoinTestRunner, +// testing::ValuesIn(JoinTestRunner::create_configurations())); } // namespace hyrise diff --git a/src/test/lib/utils/print_utils_test.cpp b/src/test/lib/utils/print_utils_test.cpp index a39c030d1f..3e3d723647 100644 --- a/src/test/lib/utils/print_utils_test.cpp +++ b/src/test/lib/utils/print_utils_test.cpp @@ -79,7 +79,8 @@ TEST_F(PrintUtilsTest, print_table_key_constraints) { } TEST_F(PrintUtilsTest, all_encoding_options) { - EXPECT_EQ(all_encoding_options(), "Unencoded, Dictionary, RunLength, FixedStringDictionary, FrameOfReference, LZ4"); + EXPECT_EQ(all_encoding_options(), + "Unencoded, Dictionary, RunLength, FixedStringDictionary, FrameOfReference, LZ4, VariableStringDictionary"); } } // namespace hyrise From 65d37b1bf64a21234c40cd7f9cd47b0ddb62e632 Mon Sep 17 00:00:00 2001 From: Philipp Keese Date: Tue, 4 Jul 2023 10:50:25 +0200 Subject: [PATCH 10/71] WIP: Write test for functionality of segment alone to decouple segment and encoder. --- .../variable_string_dictionary_encoder.hpp | 24 +- .../variable_string_dictionary_segment.hpp | 11 +- src/test/CMakeLists.txt | 556 +++++++++--------- src/test/lib/operators/join_test_runner.cpp | 3 +- ...ariable_string_dictionary_segment_test.cpp | 168 ++++++ 5 files changed, 467 insertions(+), 295 deletions(-) create mode 100644 src/test/lib/storage/variable_string_dictionary_segment_test.cpp diff --git a/src/lib/storage/variable_string_dictionary/variable_string_dictionary_encoder.hpp b/src/lib/storage/variable_string_dictionary/variable_string_dictionary_encoder.hpp index 328e3c720a..471225a95e 100644 --- a/src/lib/storage/variable_string_dictionary/variable_string_dictionary_encoder.hpp +++ b/src/lib/storage/variable_string_dictionary/variable_string_dictionary_encoder.hpp @@ -9,11 +9,11 @@ #include "storage/fixed_string_dictionary_segment.hpp" #include "storage/segment_iterables/any_segment_iterable.hpp" #include "storage/value_segment.hpp" +#include "storage/variable_string_dictionary_segment.hpp" #include "storage/vector_compression/base_compressed_vector.hpp" #include "storage/vector_compression/vector_compression.hpp" #include "types.hpp" #include "utils/enum_constant.hpp" -#include "storage/variable_string_dictionary_segment.hpp" namespace hyrise { @@ -30,11 +30,10 @@ class VariableStringDictionaryEncoder : public SegmentEncoder _on_encode(const AnySegmentIterable segment_iterable, const PolymorphicAllocator& allocator) { - // Vectors to gather the input segment's data. This data is used in a later step to // construct the actual dictionary and attribute vector. - auto dense_values = std::vector(); // contains the actual values (no NULLs) - auto null_values = std::vector(); // bitmap to mark NULL values + auto dense_values = std::vector(); // contains the actual values (no NULLs) + auto null_values = std::vector(); // bitmap to mark NULL values auto string_to_positions = std::unordered_map>(); auto segment_size = uint32_t{0}; @@ -48,7 +47,7 @@ class VariableStringDictionaryEncoder : public SegmentEncoder(); auto current_offset = uint32_t{0}; - for (const auto & value : dense_values) { + for (const auto& value : dense_values) { memcpy(klotz->data() + current_offset, value.c_str(), value.size()); string_offsets[value] = current_offset; current_offset += value.size() + 1; @@ -78,7 +77,7 @@ class VariableStringDictionaryEncoder : public SegmentEncoder>(pmr_vector(segment_size)); - for (const auto & [string, positions] : string_to_positions) { + for (const auto& [string, positions] : string_to_positions) { const auto here = string_offsets[string]; for (const auto position : positions) { (*position_to_offset)[position] = here; @@ -86,12 +85,11 @@ class VariableStringDictionaryEncoder : public SegmentEncoder(compress_vector( - *position_to_offset, SegmentEncoder::vector_compression_type(), - allocator, {max_value_id})); + const auto compressed_position_to_offset = std::shared_ptr( + compress_vector(*position_to_offset, SegmentEncoder::vector_compression_type(), + allocator, {max_value_id})); - return std::make_shared( - klotz, compressed_position_to_offset, position_to_offset); + return std::make_shared(klotz, compressed_position_to_offset, position_to_offset); } private: diff --git a/src/lib/storage/variable_string_dictionary_segment.hpp b/src/lib/storage/variable_string_dictionary_segment.hpp index 2c50aeefaf..cc280c2f23 100644 --- a/src/lib/storage/variable_string_dictionary_segment.hpp +++ b/src/lib/storage/variable_string_dictionary_segment.hpp @@ -19,11 +19,11 @@ class BaseCompressedVector; class VariableStringDictionarySegment : public BaseDictionarySegment { public: VariableStringDictionarySegment(const std::shared_ptr>& dictionary, - const std::shared_ptr& attribute_vector, - const std::shared_ptr> offset_vector); + const std::shared_ptr& attribute_vector, + const std::shared_ptr> offset_vector); // TODO:: remove - VariableStringDictionarySegment(): BaseDictionarySegment(static_cast(0)), _unique_value_count(0){}; + VariableStringDictionarySegment() : BaseDictionarySegment(DataType::String), _unique_value_count(0){}; // returns an underlying dictionary std::shared_ptr> dictionary() const; @@ -84,6 +84,11 @@ class VariableStringDictionarySegment : public BaseDictionarySegment { ValueID::base_type unique_values_count() const final; std::shared_ptr attribute_vector() const final; + // ValueID -> Offset : ok (offset_vector) + // Offset -> ValueId : need + /* Idea 1: Use ValueID to index into offsets to get offsets + * + */ ValueID null_value_id() const final; diff --git a/src/test/CMakeLists.txt b/src/test/CMakeLists.txt index 3ffecbccbc..8272d9bd13 100644 --- a/src/test/CMakeLists.txt +++ b/src/test/CMakeLists.txt @@ -1,285 +1,285 @@ set( - SHARED_SOURCES - base_test.cpp - base_test.hpp - lib/sql/sqlite_testrunner/sqlite_testrunner.cpp - lib/sql/sqlite_testrunner/sqlite_testrunner.hpp - testing_assert.cpp - testing_assert.hpp + SHARED_SOURCES + base_test.cpp + base_test.hpp + lib/sql/sqlite_testrunner/sqlite_testrunner.cpp + lib/sql/sqlite_testrunner/sqlite_testrunner.hpp + testing_assert.cpp + testing_assert.hpp ) set( - HYRISE_UNIT_TEST_SOURCES - ${SHARED_SOURCES} - benchmarklib/sqlite_add_indices_test.cpp - benchmarklib/table_builder_test.cpp - gtest_case_template.cpp - gtest_main.cpp - lib/all_parameter_variant_test.cpp - lib/all_type_variant_test.cpp - lib/cache/cache_test.cpp - lib/concurrency/commit_context_test.cpp - lib/concurrency/transaction_context_test.cpp - lib/concurrency/transaction_manager_test.cpp - lib/cost_estimation/abstract_cost_estimator_test.cpp - lib/expression/evaluation/expression_result_test.cpp - lib/expression/evaluation/like_matcher_test.cpp - lib/expression/expression_evaluator_to_pos_list_test.cpp - lib/expression/expression_evaluator_to_values_test.cpp - lib/expression/expression_test.cpp - lib/expression/expression_utils_test.cpp - lib/expression/lqp_subquery_expression_test.cpp - lib/expression/pqp_subquery_expression_test.cpp - lib/hyrise_test.cpp - lib/import_export/binary/binary_parser_test.cpp - lib/import_export/binary/binary_writer_test.cpp - lib/import_export/csv/csv_meta_test.cpp - lib/import_export/csv/csv_parser_test.cpp - lib/import_export/csv/csv_writer_test.cpp - lib/logical_query_plan/aggregate_node_test.cpp - lib/logical_query_plan/alias_node_test.cpp - lib/logical_query_plan/change_meta_table_node_test.cpp - lib/logical_query_plan/create_prepared_plan_node_test.cpp - lib/logical_query_plan/create_table_node_test.cpp - lib/logical_query_plan/create_view_node_test.cpp - lib/logical_query_plan/data_dependencies/functional_dependency_test.cpp - lib/logical_query_plan/data_dependencies/unique_column_combination_test.cpp - lib/logical_query_plan/delete_node_test.cpp - lib/logical_query_plan/drop_table_node_test.cpp - lib/logical_query_plan/drop_view_node_test.cpp - lib/logical_query_plan/dummy_table_node_test.cpp - lib/logical_query_plan/except_node_test.cpp - lib/logical_query_plan/export_node_test.cpp - lib/logical_query_plan/import_node_test.cpp - lib/logical_query_plan/insert_node_test.cpp - lib/logical_query_plan/intersect_node_test.cpp - lib/logical_query_plan/join_node_test.cpp - lib/logical_query_plan/limit_node_test.cpp - lib/logical_query_plan/logical_query_plan_test.cpp - lib/logical_query_plan/lqp_find_subplan_mismatch_test.cpp - lib/logical_query_plan/lqp_translator_test.cpp - lib/logical_query_plan/lqp_utils_test.cpp - lib/logical_query_plan/mock_node_test.cpp - lib/logical_query_plan/predicate_node_test.cpp - lib/logical_query_plan/projection_node_test.cpp - lib/logical_query_plan/sort_node_test.cpp - lib/logical_query_plan/static_table_node_test.cpp - lib/logical_query_plan/stored_table_node_test.cpp - lib/logical_query_plan/union_node_test.cpp - lib/logical_query_plan/update_node_test.cpp - lib/logical_query_plan/validate_node_test.cpp - lib/lossless_cast_test.cpp - lib/lossy_cast_test.cpp - lib/memory/segments_using_allocators_test.cpp - lib/memory/zero_allocator_test.cpp - lib/null_value_test.cpp - lib/operators/aggregate_sort_test.cpp - lib/operators/aggregate_test.cpp - lib/operators/alias_operator_test.cpp - lib/operators/change_meta_table_test.cpp - lib/operators/delete_test.cpp - lib/operators/difference_test.cpp - lib/operators/export_test.cpp - lib/operators/get_table_test.cpp - lib/operators/import_test.cpp - lib/operators/index_scan_test.cpp - lib/operators/insert_test.cpp - lib/operators/join_hash/join_hash_steps_test.cpp - lib/operators/join_hash/join_hash_traits_test.cpp - lib/operators/join_hash/join_hash_types_test.cpp - lib/operators/join_hash_test.cpp - lib/operators/join_index_test.cpp - lib/operators/join_nested_loop_test.cpp - lib/operators/join_sort_merge_test.cpp - lib/operators/join_test_runner.cpp - lib/operators/join_verification_test.cpp - lib/operators/limit_test.cpp - lib/operators/maintenance/create_prepared_plan_test.cpp - lib/operators/maintenance/create_table_test.cpp - lib/operators/maintenance/create_view_test.cpp - lib/operators/maintenance/drop_table_test.cpp - lib/operators/maintenance/drop_view_test.cpp - lib/operators/operator_clear_output_test.cpp - lib/operators/operator_deep_copy_test.cpp - lib/operators/operator_join_predicate_test.cpp - lib/operators/operator_performance_data_test.cpp - lib/operators/operator_scan_predicate_test.cpp - lib/operators/pqp_utils_test.cpp - lib/operators/print_test.cpp - lib/operators/product_test.cpp - lib/operators/projection_test.cpp - lib/operators/sort_test.cpp - lib/operators/table_scan_between_test.cpp - lib/operators/table_scan_sorted_segment_search_test.cpp - lib/operators/table_scan_string_test.cpp - lib/operators/table_scan_test.cpp - lib/operators/typed_operator_base_test.hpp - lib/operators/union_all_test.cpp - lib/operators/union_positions_test.cpp - lib/operators/update_test.cpp - lib/operators/validate_test.cpp - lib/operators/validate_visibility_test.cpp - lib/optimizer/join_ordering/dp_ccp_test.cpp - lib/optimizer/join_ordering/enumerate_ccp_test.cpp - lib/optimizer/join_ordering/greedy_operator_ordering_test.cpp - lib/optimizer/join_ordering/join_graph_builder_test.cpp - lib/optimizer/join_ordering/join_graph_test.cpp - lib/optimizer/optimizer_test.cpp - lib/optimizer/strategy/between_composition_rule_test.cpp - lib/optimizer/strategy/chunk_pruning_rule_test.cpp - lib/optimizer/strategy/column_pruning_rule_test.cpp - lib/optimizer/strategy/dependent_group_by_reduction_rule_test.cpp - lib/optimizer/strategy/expression_reduction_rule_test.cpp - lib/optimizer/strategy/in_expression_rewrite_rule_test.cpp - lib/optimizer/strategy/index_scan_rule_test.cpp - lib/optimizer/strategy/join_ordering_rule_test.cpp - lib/optimizer/strategy/join_predicate_ordering_rule_test.cpp - lib/optimizer/strategy/join_to_predicate_rewrite_rule_test.cpp - lib/optimizer/strategy/join_to_semi_join_rule_test.cpp - lib/optimizer/strategy/null_scan_removal_rule_test.cpp - lib/optimizer/strategy/predicate_merge_rule_test.cpp - lib/optimizer/strategy/predicate_placement_rule_test.cpp - lib/optimizer/strategy/predicate_reordering_rule_test.cpp - lib/optimizer/strategy/predicate_split_up_rule_test.cpp - lib/optimizer/strategy/semi_join_reduction_rule_test.cpp - lib/optimizer/strategy/stored_table_column_alignment_rule_test.cpp - lib/optimizer/strategy/strategy_base_test.cpp - lib/optimizer/strategy/strategy_base_test.hpp - lib/optimizer/strategy/subquery_to_join_rule_test.cpp - lib/scheduler/operator_task_test.cpp - lib/scheduler/scheduler_test.cpp - lib/scheduler/task_queue_test.cpp - lib/server/mock_socket.hpp - lib/server/postgres_protocol_handler_test.cpp - lib/server/query_handler_test.cpp - lib/server/read_buffer_test.cpp - lib/server/result_serializer_test.cpp - lib/server/transaction_handling_test.cpp - lib/server/write_buffer_test.cpp - lib/sql/sql_identifier_resolver_test.cpp - lib/sql/sql_pipeline_statement_test.cpp - lib/sql/sql_pipeline_test.cpp - lib/sql/sql_plan_cache_test.cpp - lib/sql/sql_translator_test.cpp - lib/sql/sqlite_testrunner/sqlite_testrunner_unencoded.cpp - lib/sql/sqlite_testrunner/sqlite_wrapper_test.cpp - lib/statistics/attribute_statistics_test.cpp - lib/statistics/cardinality_estimator_test.cpp - lib/statistics/join_graph_statistics_cache_test.cpp - lib/statistics/statistics_objects/equal_distinct_count_histogram_test.cpp - lib/statistics/statistics_objects/generic_histogram_test.cpp - lib/statistics/statistics_objects/min_max_filter_test.cpp - lib/statistics/statistics_objects/range_filter_test.cpp - lib/statistics/statistics_objects/string_histogram_domain_test.cpp - lib/statistics/table_statistics_test.cpp - lib/storage/any_segment_iterable_test.cpp - lib/storage/chunk_encoder_test.cpp - lib/storage/chunk_test.cpp - lib/storage/compressed_vector_test.cpp - lib/storage/constraints/foreign_key_constraint_test.cpp - lib/storage/constraints/table_key_constraint_test.cpp - lib/storage/constraints/table_order_constraint_test.cpp - lib/storage/dictionary_segment_test.cpp - lib/storage/encoded_segment_test.cpp - lib/storage/encoded_string_segment_test.cpp - lib/storage/encoding_test.hpp - lib/storage/fixed_string_dictionary_segment/fixed_string_test.cpp - lib/storage/fixed_string_dictionary_segment/fixed_string_vector_test.cpp - lib/storage/fixed_string_dictionary_segment_test.cpp - lib/storage/index/adaptive_radix_tree/adaptive_radix_tree_index_test.cpp - lib/storage/index/b_tree/b_tree_index_test.cpp - lib/storage/index/group_key/composite_group_key_index_test.cpp - lib/storage/index/group_key/group_key_index_test.cpp - lib/storage/index/group_key/variable_length_key_base_test.cpp - lib/storage/index/group_key/variable_length_key_store_test.cpp - lib/storage/index/group_key/variable_length_key_test.cpp - lib/storage/index/multi_segment_index_test.cpp - lib/storage/index/partial_hash/partial_hash_index_test.cpp - lib/storage/index/single_segment_index_test.cpp - lib/storage/iterables_test.cpp - lib/storage/lz4_segment_test.cpp - lib/storage/materialize_test.cpp - lib/storage/pos_lists/entire_chunk_pos_list_test.cpp - lib/storage/prepared_plan_test.cpp - lib/storage/reference_segment_test.cpp - lib/storage/segment_access_counter_test.cpp - lib/storage/segment_accessor_test.cpp - lib/storage/segment_iterators_test.cpp - lib/storage/storage_manager_test.cpp - lib/storage/table_column_definition_test.cpp - lib/storage/table_test.cpp - lib/storage/value_segment_test.cpp - lib/tasks/chunk_compression_task_test.cpp - lib/utils/check_table_equal_test.cpp - lib/utils/column_pruning_utils_test.cpp - lib/utils/date_time_utils_test.cpp - lib/utils/format_bytes_test.cpp - lib/utils/format_duration_test.cpp - lib/utils/load_table_test.cpp - lib/utils/log_manager_test.cpp - lib/utils/lossless_predicate_cast_test.cpp - lib/utils/meta_table_manager_test.cpp - lib/utils/meta_tables/meta_exec_table_test.cpp - lib/utils/meta_tables/meta_log_table_test.cpp - lib/utils/meta_tables/meta_mock_table.cpp - lib/utils/meta_tables/meta_mock_table.hpp - lib/utils/meta_tables/meta_plugins_table_test.cpp - lib/utils/meta_tables/meta_segments_accurate_test.cpp - lib/utils/meta_tables/meta_settings_table_test.cpp - lib/utils/meta_tables/meta_system_utilization_table_test.cpp - lib/utils/meta_tables/meta_table_test.cpp - lib/utils/mock_setting.cpp - lib/utils/mock_setting.hpp - lib/utils/plugin_manager_test.cpp - lib/utils/plugin_test_utils.cpp - lib/utils/plugin_test_utils.hpp - lib/utils/print_utils_test.cpp - lib/utils/setting_test.cpp - lib/utils/settings_manager_test.cpp - lib/utils/singleton_test.cpp - lib/utils/size_estimation_utils_test.cpp - lib/utils/string_utils_test.cpp - plugins/mvcc_delete_plugin_test.cpp - plugins/ucc_discovery_plugin_test.cpp - testing_assert.cpp - testing_assert.hpp - utils/data_dependency_test_utils.hpp + HYRISE_UNIT_TEST_SOURCES + ${SHARED_SOURCES} + benchmarklib/sqlite_add_indices_test.cpp + benchmarklib/table_builder_test.cpp + gtest_case_template.cpp + gtest_main.cpp + lib/all_parameter_variant_test.cpp + lib/all_type_variant_test.cpp + lib/cache/cache_test.cpp + lib/concurrency/commit_context_test.cpp + lib/concurrency/transaction_context_test.cpp + lib/concurrency/transaction_manager_test.cpp + lib/cost_estimation/abstract_cost_estimator_test.cpp + lib/expression/evaluation/expression_result_test.cpp + lib/expression/evaluation/like_matcher_test.cpp + lib/expression/expression_evaluator_to_pos_list_test.cpp + lib/expression/expression_evaluator_to_values_test.cpp + lib/expression/expression_test.cpp + lib/expression/expression_utils_test.cpp + lib/expression/lqp_subquery_expression_test.cpp + lib/expression/pqp_subquery_expression_test.cpp + lib/hyrise_test.cpp + lib/import_export/binary/binary_parser_test.cpp + lib/import_export/binary/binary_writer_test.cpp + lib/import_export/csv/csv_meta_test.cpp + lib/import_export/csv/csv_parser_test.cpp + lib/import_export/csv/csv_writer_test.cpp + lib/logical_query_plan/aggregate_node_test.cpp + lib/logical_query_plan/alias_node_test.cpp + lib/logical_query_plan/change_meta_table_node_test.cpp + lib/logical_query_plan/create_prepared_plan_node_test.cpp + lib/logical_query_plan/create_table_node_test.cpp + lib/logical_query_plan/create_view_node_test.cpp + lib/logical_query_plan/data_dependencies/functional_dependency_test.cpp + lib/logical_query_plan/data_dependencies/unique_column_combination_test.cpp + lib/logical_query_plan/delete_node_test.cpp + lib/logical_query_plan/drop_table_node_test.cpp + lib/logical_query_plan/drop_view_node_test.cpp + lib/logical_query_plan/dummy_table_node_test.cpp + lib/logical_query_plan/except_node_test.cpp + lib/logical_query_plan/export_node_test.cpp + lib/logical_query_plan/import_node_test.cpp + lib/logical_query_plan/insert_node_test.cpp + lib/logical_query_plan/intersect_node_test.cpp + lib/logical_query_plan/join_node_test.cpp + lib/logical_query_plan/limit_node_test.cpp + lib/logical_query_plan/logical_query_plan_test.cpp + lib/logical_query_plan/lqp_find_subplan_mismatch_test.cpp + lib/logical_query_plan/lqp_translator_test.cpp + lib/logical_query_plan/lqp_utils_test.cpp + lib/logical_query_plan/mock_node_test.cpp + lib/logical_query_plan/predicate_node_test.cpp + lib/logical_query_plan/projection_node_test.cpp + lib/logical_query_plan/sort_node_test.cpp + lib/logical_query_plan/static_table_node_test.cpp + lib/logical_query_plan/stored_table_node_test.cpp + lib/logical_query_plan/union_node_test.cpp + lib/logical_query_plan/update_node_test.cpp + lib/logical_query_plan/validate_node_test.cpp + lib/lossless_cast_test.cpp + lib/lossy_cast_test.cpp + lib/memory/segments_using_allocators_test.cpp + lib/memory/zero_allocator_test.cpp + lib/null_value_test.cpp + lib/operators/aggregate_sort_test.cpp + lib/operators/aggregate_test.cpp + lib/operators/alias_operator_test.cpp + lib/operators/change_meta_table_test.cpp + lib/operators/delete_test.cpp + lib/operators/difference_test.cpp + lib/operators/export_test.cpp + lib/operators/get_table_test.cpp + lib/operators/import_test.cpp + lib/operators/index_scan_test.cpp + lib/operators/insert_test.cpp + lib/operators/join_hash/join_hash_steps_test.cpp + lib/operators/join_hash/join_hash_traits_test.cpp + lib/operators/join_hash/join_hash_types_test.cpp + lib/operators/join_hash_test.cpp + lib/operators/join_index_test.cpp + lib/operators/join_nested_loop_test.cpp + lib/operators/join_sort_merge_test.cpp + lib/operators/join_test_runner.cpp + lib/operators/join_verification_test.cpp + lib/operators/limit_test.cpp + lib/operators/maintenance/create_prepared_plan_test.cpp + lib/operators/maintenance/create_table_test.cpp + lib/operators/maintenance/create_view_test.cpp + lib/operators/maintenance/drop_table_test.cpp + lib/operators/maintenance/drop_view_test.cpp + lib/operators/operator_clear_output_test.cpp + lib/operators/operator_deep_copy_test.cpp + lib/operators/operator_join_predicate_test.cpp + lib/operators/operator_performance_data_test.cpp + lib/operators/operator_scan_predicate_test.cpp + lib/operators/pqp_utils_test.cpp + lib/operators/print_test.cpp + lib/operators/product_test.cpp + lib/operators/projection_test.cpp + lib/operators/sort_test.cpp + lib/operators/table_scan_between_test.cpp + lib/operators/table_scan_sorted_segment_search_test.cpp + lib/operators/table_scan_string_test.cpp + lib/operators/table_scan_test.cpp + lib/operators/typed_operator_base_test.hpp + lib/operators/union_all_test.cpp + lib/operators/union_positions_test.cpp + lib/operators/update_test.cpp + lib/operators/validate_test.cpp + lib/operators/validate_visibility_test.cpp + lib/optimizer/join_ordering/dp_ccp_test.cpp + lib/optimizer/join_ordering/enumerate_ccp_test.cpp + lib/optimizer/join_ordering/greedy_operator_ordering_test.cpp + lib/optimizer/join_ordering/join_graph_builder_test.cpp + lib/optimizer/join_ordering/join_graph_test.cpp + lib/optimizer/optimizer_test.cpp + lib/optimizer/strategy/between_composition_rule_test.cpp + lib/optimizer/strategy/chunk_pruning_rule_test.cpp + lib/optimizer/strategy/column_pruning_rule_test.cpp + lib/optimizer/strategy/dependent_group_by_reduction_rule_test.cpp + lib/optimizer/strategy/expression_reduction_rule_test.cpp + lib/optimizer/strategy/in_expression_rewrite_rule_test.cpp + lib/optimizer/strategy/index_scan_rule_test.cpp + lib/optimizer/strategy/join_ordering_rule_test.cpp + lib/optimizer/strategy/join_predicate_ordering_rule_test.cpp + lib/optimizer/strategy/join_to_predicate_rewrite_rule_test.cpp + lib/optimizer/strategy/join_to_semi_join_rule_test.cpp + lib/optimizer/strategy/null_scan_removal_rule_test.cpp + lib/optimizer/strategy/predicate_merge_rule_test.cpp + lib/optimizer/strategy/predicate_placement_rule_test.cpp + lib/optimizer/strategy/predicate_reordering_rule_test.cpp + lib/optimizer/strategy/predicate_split_up_rule_test.cpp + lib/optimizer/strategy/semi_join_reduction_rule_test.cpp + lib/optimizer/strategy/stored_table_column_alignment_rule_test.cpp + lib/optimizer/strategy/strategy_base_test.cpp + lib/optimizer/strategy/strategy_base_test.hpp + lib/optimizer/strategy/subquery_to_join_rule_test.cpp + lib/scheduler/operator_task_test.cpp + lib/scheduler/scheduler_test.cpp + lib/scheduler/task_queue_test.cpp + lib/server/mock_socket.hpp + lib/server/postgres_protocol_handler_test.cpp + lib/server/query_handler_test.cpp + lib/server/read_buffer_test.cpp + lib/server/result_serializer_test.cpp + lib/server/transaction_handling_test.cpp + lib/server/write_buffer_test.cpp + lib/sql/sql_identifier_resolver_test.cpp + lib/sql/sql_pipeline_statement_test.cpp + lib/sql/sql_pipeline_test.cpp + lib/sql/sql_plan_cache_test.cpp + lib/sql/sql_translator_test.cpp + lib/sql/sqlite_testrunner/sqlite_testrunner_unencoded.cpp + lib/sql/sqlite_testrunner/sqlite_wrapper_test.cpp + lib/statistics/attribute_statistics_test.cpp + lib/statistics/cardinality_estimator_test.cpp + lib/statistics/join_graph_statistics_cache_test.cpp + lib/statistics/statistics_objects/equal_distinct_count_histogram_test.cpp + lib/statistics/statistics_objects/generic_histogram_test.cpp + lib/statistics/statistics_objects/min_max_filter_test.cpp + lib/statistics/statistics_objects/range_filter_test.cpp + lib/statistics/statistics_objects/string_histogram_domain_test.cpp + lib/statistics/table_statistics_test.cpp + lib/storage/any_segment_iterable_test.cpp + lib/storage/chunk_encoder_test.cpp + lib/storage/chunk_test.cpp + lib/storage/compressed_vector_test.cpp + lib/storage/constraints/foreign_key_constraint_test.cpp + lib/storage/constraints/table_key_constraint_test.cpp + lib/storage/constraints/table_order_constraint_test.cpp + lib/storage/dictionary_segment_test.cpp + lib/storage/encoded_segment_test.cpp + lib/storage/encoded_string_segment_test.cpp + lib/storage/encoding_test.hpp + lib/storage/fixed_string_dictionary_segment/fixed_string_test.cpp + lib/storage/fixed_string_dictionary_segment/fixed_string_vector_test.cpp + lib/storage/fixed_string_dictionary_segment_test.cpp + lib/storage/index/adaptive_radix_tree/adaptive_radix_tree_index_test.cpp + lib/storage/index/b_tree/b_tree_index_test.cpp + lib/storage/index/group_key/composite_group_key_index_test.cpp + lib/storage/index/group_key/group_key_index_test.cpp + lib/storage/index/group_key/variable_length_key_base_test.cpp + lib/storage/index/group_key/variable_length_key_store_test.cpp + lib/storage/index/group_key/variable_length_key_test.cpp + lib/storage/index/multi_segment_index_test.cpp + lib/storage/index/partial_hash/partial_hash_index_test.cpp + lib/storage/index/single_segment_index_test.cpp + lib/storage/iterables_test.cpp + lib/storage/lz4_segment_test.cpp + lib/storage/materialize_test.cpp + lib/storage/pos_lists/entire_chunk_pos_list_test.cpp + lib/storage/prepared_plan_test.cpp + lib/storage/reference_segment_test.cpp + lib/storage/segment_access_counter_test.cpp + lib/storage/segment_accessor_test.cpp + lib/storage/segment_iterators_test.cpp + lib/storage/storage_manager_test.cpp + lib/storage/table_column_definition_test.cpp + lib/storage/table_test.cpp + lib/storage/value_segment_test.cpp + lib/tasks/chunk_compression_task_test.cpp + lib/utils/check_table_equal_test.cpp + lib/utils/column_pruning_utils_test.cpp + lib/utils/date_time_utils_test.cpp + lib/utils/format_bytes_test.cpp + lib/utils/format_duration_test.cpp + lib/utils/load_table_test.cpp + lib/utils/log_manager_test.cpp + lib/utils/lossless_predicate_cast_test.cpp + lib/utils/meta_table_manager_test.cpp + lib/utils/meta_tables/meta_exec_table_test.cpp + lib/utils/meta_tables/meta_log_table_test.cpp + lib/utils/meta_tables/meta_mock_table.cpp + lib/utils/meta_tables/meta_mock_table.hpp + lib/utils/meta_tables/meta_plugins_table_test.cpp + lib/utils/meta_tables/meta_segments_accurate_test.cpp + lib/utils/meta_tables/meta_settings_table_test.cpp + lib/utils/meta_tables/meta_system_utilization_table_test.cpp + lib/utils/meta_tables/meta_table_test.cpp + lib/utils/mock_setting.cpp + lib/utils/mock_setting.hpp + lib/utils/plugin_manager_test.cpp + lib/utils/plugin_test_utils.cpp + lib/utils/plugin_test_utils.hpp + lib/utils/print_utils_test.cpp + lib/utils/setting_test.cpp + lib/utils/settings_manager_test.cpp + lib/utils/singleton_test.cpp + lib/utils/size_estimation_utils_test.cpp + lib/utils/string_utils_test.cpp + plugins/mvcc_delete_plugin_test.cpp + plugins/ucc_discovery_plugin_test.cpp + testing_assert.cpp + testing_assert.hpp + utils/data_dependency_test_utils.hpp ) -set ( - SYSTEM_TEST_SOURCES - ${SHARED_SOURCES} - benchmarklib/synthetic_table_generator_test.cpp - benchmarklib/tpcc/tpcc_test.cpp - benchmarklib/tpcds/tpcds_db_generator_test.cpp - benchmarklib/tpch/tpch_db_generator_test.cpp - gtest_main.cpp - lib/concurrency/stress_test.cpp - lib/server/server_test_runner.cpp - lib/sql/sqlite_testrunner/sqlite_testrunner_encodings.cpp - lib/utils/plugin_test_utils.cpp - lib/utils/plugin_test_utils.hpp - plugins/mvcc_delete_plugin_system_test.cpp -) +set( + SYSTEM_TEST_SOURCES + ${SHARED_SOURCES} + benchmarklib/synthetic_table_generator_test.cpp + benchmarklib/tpcc/tpcc_test.cpp + benchmarklib/tpcds/tpcds_db_generator_test.cpp + benchmarklib/tpch/tpch_db_generator_test.cpp + gtest_main.cpp + lib/concurrency/stress_test.cpp + lib/server/server_test_runner.cpp + lib/sql/sqlite_testrunner/sqlite_testrunner_encodings.cpp + lib/utils/plugin_test_utils.cpp + lib/utils/plugin_test_utils.hpp + plugins/mvcc_delete_plugin_system_test.cpp + lib/storage/variable_string_dictionary_segment_test.cpp) # Both hyriseTest and hyriseSystemTest link against these set( - LIBRARIES - gtest - gmock - SQLite::SQLite3 - # Added plugin targets so that we can test member methods without going through dlsym - hyriseMvccDeletePlugin - hyriseUccDiscoveryPlugin - # Required for testing plugin benchmark hooks - hyriseBenchmarkLib + LIBRARIES + gtest + gmock + SQLite::SQLite3 + # Added plugin targets so that we can test member methods without going through dlsym + hyriseMvccDeletePlugin + hyriseUccDiscoveryPlugin + # Required for testing plugin benchmark hooks + hyriseBenchmarkLib ) # This warning does not play well with SCOPED_TRACE add_compile_options(-Wno-used-but-marked-unused) # We define TEST_PLUGIN_DIR to always load plugins from the correct directory for testing purposes -add_definitions(-DTEST_PLUGIN_DIR="${CMAKE_BINARY_DIR}/lib/") +add_definitions(-DTEST_PLUGIN_DIR= "${CMAKE_BINARY_DIR}/lib/") # Build special sanitizer version of googletest include_directories(../../third_party/googletest/googletest/) @@ -291,18 +291,18 @@ add_executable(hyriseTest ${HYRISE_UNIT_TEST_SOURCES}) add_dependencies(hyriseTest hyriseSecondTestPlugin hyriseTestPlugin hyriseMvccDeletePlugin hyriseTestNonInstantiablePlugin hyriseUccDiscoveryPlugin) target_link_libraries(hyriseTest hyrise ${LIBRARIES}) -if("${CMAKE_CXX_COMPILER_ID}" STREQUAL "Clang") -target_compile_options(hyriseTest PUBLIC -Xclang -fno-pch-timestamp) -endif() -if(NOT ${CMAKE_VERSION} VERSION_LESS "3.16.0") +if ("${CMAKE_CXX_COMPILER_ID}" STREQUAL "Clang") + target_compile_options(hyriseTest PUBLIC -Xclang -fno-pch-timestamp) +endif () +if (NOT ${CMAKE_VERSION} VERSION_LESS "3.16.0") target_precompile_headers(hyriseTest PRIVATE - - [["all_parameter_variant.hpp"]] - [["storage/create_iterable_from_segment.hpp"]] - [["storage/table.hpp"]] - [["types.hpp"]] - ) -endif() + + [["all_parameter_variant.hpp"]] + [["storage/create_iterable_from_segment.hpp"]] + [["storage/table.hpp"]] + [["types.hpp"]] + ) +endif () # Configure hyriseSystemTest add_executable(hyriseSystemTest ${SYSTEM_TEST_SOURCES}) diff --git a/src/test/lib/operators/join_test_runner.cpp b/src/test/lib/operators/join_test_runner.cpp index 058b75845b..e3bafae9c6 100644 --- a/src/test/lib/operators/join_test_runner.cpp +++ b/src/test/lib/operators/join_test_runner.cpp @@ -111,7 +111,7 @@ bool operator<(const JoinTestConfiguration& l, const JoinTestConfiguration& r) { return l.to_tuple() < r.to_tuple(); } -bool operator==(const JoinTestConfiguration& l, const JoinTestConfiguration& r) { +bool __attribute__((used)) operator==(const JoinTestConfiguration& l, const JoinTestConfiguration& r) { return l.to_tuple() == r.to_tuple(); } @@ -769,6 +769,7 @@ TEST_P(JoinTestRunner, TestJoin) { } } } + // //INSTANTIATE_TEST_SUITE_P(JoinNestedLoop, JoinTestRunner, // testing::ValuesIn(JoinTestRunner::create_configurations())); diff --git a/src/test/lib/storage/variable_string_dictionary_segment_test.cpp b/src/test/lib/storage/variable_string_dictionary_segment_test.cpp new file mode 100644 index 0000000000..dbb5bca55e --- /dev/null +++ b/src/test/lib/storage/variable_string_dictionary_segment_test.cpp @@ -0,0 +1,168 @@ +#include +#include +#include + +#include "base_test.hpp" + +#include "storage/chunk_encoder.hpp" +#include "storage/segment_encoding_utils.hpp" +#include "storage/value_segment.hpp" +#include "storage/variable_string_dictionary_segment.hpp" +#include "storage/vector_compression/fixed_width_integer/fixed_width_integer_vector.hpp" + +namespace hyrise { + +class StorageVariableStringDictionarySegmentTest : public BaseTest { + protected: + std::shared_ptr> vs_str = std::make_shared>(); +}; + +TEST_F(StorageVariableStringDictionarySegmentTest, CompressSegmentString) { + vs_str->append("Bill"); + vs_str->append("Steve"); + vs_str->append("Alexander"); + vs_str->append("Steve"); + vs_str->append("Hasso"); + vs_str->append("Bill"); + + auto segment = ChunkEncoder::encode_segment(vs_str, DataType::String, + SegmentEncodingSpec{EncodingType::VariableStringDictionary}); + auto dict_segment = std::dynamic_pointer_cast(segment); + + // Test attribute_vector size + EXPECT_EQ(dict_segment->size(), 6u); + EXPECT_EQ(dict_segment->attribute_vector()->size(), 6u); + + // Test dictionary size (uniqueness) + EXPECT_EQ(dict_segment->unique_values_count(), 4u); + + // Test sorting + auto dict = dict_segment->dictionary(); + // TODO: Replace with working code + // EXPECT_EQ(*(dict->begin()), "Alexander"); + // EXPECT_EQ(*(dict->begin() + 1), "Bill"); + // EXPECT_EQ(*(dict->begin() + 2), "Hasso"); + // EXPECT_EQ(*(dict->begin() + 3), "Steve"); +} + +TEST_F(StorageVariableStringDictionarySegmentTest, Decode) { + vs_str->append("Bill"); + vs_str->append("Steve"); + vs_str->append("Bill"); + + auto segment = ChunkEncoder::encode_segment(vs_str, DataType::String, + SegmentEncodingSpec{EncodingType::VariableStringDictionary}); + auto dict_segment = std::dynamic_pointer_cast(segment); + + EXPECT_EQ(dict_segment->encoding_type(), EncodingType::VariableStringDictionary); + EXPECT_EQ(dict_segment->compressed_vector_type(), CompressedVectorType::FixedWidthInteger1Byte); + + // Decode values + EXPECT_EQ((*dict_segment)[ChunkOffset{0}], AllTypeVariant("Bill")); + EXPECT_EQ((*dict_segment)[ChunkOffset{1}], AllTypeVariant("Steve")); + EXPECT_EQ((*dict_segment)[ChunkOffset{2}], AllTypeVariant("Bill")); +} + +TEST_F(StorageVariableStringDictionarySegmentTest, LongStrings) { + vs_str->append("ThisIsAVeryLongStringThisIsAVeryLongStringThisIsAVeryLongString"); + vs_str->append("QuiteShort"); + vs_str->append("Short"); + + auto segment = ChunkEncoder::encode_segment(vs_str, DataType::String, + SegmentEncodingSpec{EncodingType::VariableStringDictionary}); + auto dict_segment = std::dynamic_pointer_cast(segment); + + // Test sorting + auto dict = dict_segment->dictionary(); + // TODO: Here + // EXPECT_EQ(*(dict->begin()), "QuiteShort"); + // EXPECT_EQ(*(dict->begin() + 1), "Short"); + // EXPECT_EQ(*(dict->begin() + 2), "ThisIsAVeryLongStringThisIsAVeryLongStringThisIsAVeryLongString"); +} + +TEST_F(StorageVariableStringDictionarySegmentTest, LowerUpperBound) { + vs_str->append("A"); + vs_str->append("C"); + vs_str->append("E"); + vs_str->append("G"); + vs_str->append("I"); + vs_str->append("K"); + + auto segment = ChunkEncoder::encode_segment(vs_str, DataType::String, + SegmentEncodingSpec{EncodingType::VariableStringDictionary}); + auto dict_segment = std::dynamic_pointer_cast(segment); + + // Test for AllTypeVariant as parameter + EXPECT_EQ(dict_segment->lower_bound(AllTypeVariant("E")), ValueID{2}); + EXPECT_EQ(dict_segment->upper_bound(AllTypeVariant("E")), ValueID{3}); + + EXPECT_EQ(dict_segment->lower_bound(AllTypeVariant("F")), ValueID{3}); + EXPECT_EQ(dict_segment->upper_bound(AllTypeVariant("F")), ValueID{3}); + + EXPECT_EQ(dict_segment->lower_bound(AllTypeVariant("Z")), INVALID_VALUE_ID); + EXPECT_EQ(dict_segment->upper_bound(AllTypeVariant("Z")), INVALID_VALUE_ID); +} + +TEST_F(StorageVariableStringDictionarySegmentTest, NullValues) { + std::shared_ptr> vs_str = std::make_shared>(true); + + vs_str->append("A"); + vs_str->append(NULL_VALUE); + vs_str->append("E"); + + auto segment = ChunkEncoder::encode_segment(vs_str, DataType::String, + SegmentEncodingSpec{EncodingType::VariableStringDictionary}); + auto dict_segment = std::dynamic_pointer_cast(segment); + + EXPECT_EQ(dict_segment->null_value_id(), 2u); + EXPECT_TRUE(variant_is_null((*dict_segment)[ChunkOffset{1}])); +} + +TEST_F(StorageVariableStringDictionarySegmentTest, MemoryUsageEstimation) { + /** + * WARNING: Since it's hard to assert what constitutes a correct "estimation", this just tests basic sanity of the + * memory usage estimations + */ + const auto empty_compressed_segment = ChunkEncoder::encode_segment( + vs_str, DataType::String, SegmentEncodingSpec{EncodingType::VariableStringDictionary}); + const auto empty_dictionary_segment = + std::dynamic_pointer_cast(empty_compressed_segment); + // TODO: Why are we passing mode here? + const auto empty_memory_usage = empty_dictionary_segment->memory_usage(MemoryUsageCalculationMode::Full); + + vs_str->append("A"); + vs_str->append("B"); + vs_str->append("C"); + const auto compressed_segment = ChunkEncoder::encode_segment( + vs_str, DataType::String, SegmentEncodingSpec{EncodingType::VariableStringDictionary}); + const auto dictionary_segment = std::dynamic_pointer_cast(compressed_segment); + + static constexpr auto size_of_attribute = 1u; + static constexpr auto size_of_dictionary = 3u; + + // We have to substract 1 since the empty VariableStringSegment actually contains one null terminator + // TODO: Why are we specifying mode? + EXPECT_EQ(dictionary_segment->memory_usage(MemoryUsageCalculationMode::Full), + empty_memory_usage - 1u + 3 * size_of_attribute + size_of_dictionary); +} + +TEST_F(StorageVariableStringDictionarySegmentTest, TestLookup) { + const auto allocator = PolymorphicAllocator{}; + // 1. Create string data (klotz) + // Contains zero-length string at the end, just to be annoying. + const auto data = std::array{"Hello\0World\0Alexander\0String\0"}; + const auto klotz = std::make_shared>(); + klotz->resize(data.size()); + std::memcpy(klotz->data(), data.data(), data.size()); + const pmr_vector offsets{0, 6, 12, 22, 29}; + const pmr_vector attribute_vector{0, 0, 1, 3, 2, 4, 2}; + const auto segment = + VariableStringDictionarySegment{klotz, + std::shared_ptr(compress_vector( + attribute_vector, VectorCompressionType::FixedWidthInteger, allocator, {4})), + std::make_shared>(offsets)}; + + // TODO: Actually test lookup and compare with data. +} + +} // namespace hyrise From a655eada47f0b35a6b8989a37480b90ce22ffb8c Mon Sep 17 00:00:00 2001 From: ClFeSc <68013019+ClFeSc@users.noreply.github.com> Date: Tue, 4 Jul 2023 12:31:31 +0200 Subject: [PATCH 11/71] Hide offsets, instead surface only value ids in variable string segment --- .../variable_string_dictionary_encoder.hpp | 43 +++++++++++++------ .../variable_string_dictionary_segment.cpp | 9 ++-- .../variable_string_dictionary_segment.hpp | 13 +++--- ...ariable_string_dictionary_segment_test.cpp | 21 +++++++-- 4 files changed, 57 insertions(+), 29 deletions(-) diff --git a/src/lib/storage/variable_string_dictionary/variable_string_dictionary_encoder.hpp b/src/lib/storage/variable_string_dictionary/variable_string_dictionary_encoder.hpp index 471225a95e..c8d161dab8 100644 --- a/src/lib/storage/variable_string_dictionary/variable_string_dictionary_encoder.hpp +++ b/src/lib/storage/variable_string_dictionary/variable_string_dictionary_encoder.hpp @@ -45,7 +45,7 @@ class VariableStringDictionaryEncoder : public SegmentEncoder>(pmr_vector(total_size)); // We assume segment size up to 4 GByte. auto string_offsets = std::unordered_map(); + auto string_value_ids = std::unordered_map(); auto current_offset = uint32_t{0}; + auto current_value_id = ValueID{0}; for (const auto& value : dense_values) { memcpy(klotz->data() + current_offset, value.c_str(), value.size()); string_offsets[value] = current_offset; + string_value_ids[value] = current_value_id++; current_offset += value.size() + 1; } - auto position_to_offset = std::make_shared>(pmr_vector(segment_size)); + auto position_to_value_id = std::make_shared>(pmr_vector(segment_size)); + auto offset_vector = std::make_shared>(pmr_vector(dense_values.size())); + offset_vector->shrink_to_fit(); for (const auto& [string, positions] : string_to_positions) { - const auto here = string_offsets[string]; + const auto here_offset = string_offsets[string]; + const auto here_value_id = string_value_ids[string]; + (*offset_vector)[here_value_id] = here_offset; for (const auto position : positions) { - (*position_to_offset)[position] = here; + (*position_to_value_id)[position] = here_value_id; } } - const auto max_value_id = dense_values.size(); - const auto compressed_position_to_offset = std::shared_ptr( - compress_vector(*position_to_offset, SegmentEncoder::vector_compression_type(), + const auto null_value_id = dense_values.size(); + for (auto offset = ChunkOffset{0}; offset < segment_size; ++offset) { + const auto is_null = null_values[offset]; + if (is_null) { + (*position_to_value_id)[offset] = null_value_id; + } + } + + const auto max_value_id = current_value_id; + const auto compressed_position_to_value_id = std::shared_ptr( + compress_vector(*position_to_value_id, SegmentEncoder::vector_compression_type(), allocator, {max_value_id})); - return std::make_shared(klotz, compressed_position_to_offset, position_to_offset); + return std::make_shared(klotz, compressed_position_to_value_id, offset_vector); } - private: - template - static ValueID _get_value_id(const U& dictionary, const T& value) { - return ValueID{static_cast( - std::distance(dictionary->cbegin(), std::lower_bound(dictionary->cbegin(), dictionary->cend(), value)))}; - } +// private: +// template +// static ValueID _get_value_id(const U& dictionary, const T& value) { +// return ValueID{static_cast( +// std::distance(dictionary->cbegin(), std::lower_bound(dictionary->cbegin(), dictionary->cend(), value)))}; +// } }; } // namespace hyrise diff --git a/src/lib/storage/variable_string_dictionary_segment.cpp b/src/lib/storage/variable_string_dictionary_segment.cpp index 84f0947ad7..126ece4268 100644 --- a/src/lib/storage/variable_string_dictionary_segment.cpp +++ b/src/lib/storage/variable_string_dictionary_segment.cpp @@ -8,12 +8,11 @@ namespace hyrise { VariableStringDictionarySegment::VariableStringDictionarySegment( const std::shared_ptr>& dictionary, const std::shared_ptr& attribute_vector, - const std::shared_ptr> offset_vector) + const std::shared_ptr>& offset_vector) : BaseDictionarySegment(data_type_from_type()), _dictionary{dictionary}, _attribute_vector{attribute_vector}, _decompressor{attribute_vector->create_base_decompressor()}, - _unique_value_count(std::count(dictionary->begin(), dictionary->end(), '\0')), _offset_vector{offset_vector} { // NULL is represented by _offset_vector.size(). INVALID_VALUE_ID, which is the highest possible number in // ValueID::base_type (2^32 - 1), is needed to represent "value not found" in calls to lower_bound/upper_bound. @@ -29,7 +28,7 @@ std::shared_ptr> VariableStringDictionarySegment::diction AllTypeVariant VariableStringDictionarySegment::operator[](const ChunkOffset chunk_offset) const { PerformanceWarning("operator[] used"); DebugAssert(chunk_offset != INVALID_CHUNK_OFFSET, "Passed chunk offset must be valid."); - access_counter[SegmentAccessCounter::AccessType::Dictionary] += 1; + // access_counter[SegmentAccessCounter::AccessType::Dictionary] += 1; const auto value = get_typed_value(chunk_offset); return value ? value.value() : NULL_VALUE; } @@ -48,7 +47,7 @@ std::shared_ptr VariableStringDictionarySegment::copy_using_all return copy; } -size_t VariableStringDictionarySegment::memory_usage(const MemoryUsageCalculationMode mode) const { +size_t VariableStringDictionarySegment::memory_usage(const MemoryUsageCalculationMode /*mode*/) const { return _attribute_vector->data_size() + _dictionary->capacity() + _offset_vector->capacity(); } @@ -113,7 +112,7 @@ pmr_string VariableStringDictionarySegment::typed_value_of_value_id(const ValueI } ValueID::base_type VariableStringDictionarySegment::unique_values_count() const { - return _unique_value_count; + return _offset_vector->size(); } std::shared_ptr VariableStringDictionarySegment::attribute_vector() const { diff --git a/src/lib/storage/variable_string_dictionary_segment.hpp b/src/lib/storage/variable_string_dictionary_segment.hpp index cc280c2f23..cf8d5826ad 100644 --- a/src/lib/storage/variable_string_dictionary_segment.hpp +++ b/src/lib/storage/variable_string_dictionary_segment.hpp @@ -20,10 +20,10 @@ class VariableStringDictionarySegment : public BaseDictionarySegment { public: VariableStringDictionarySegment(const std::shared_ptr>& dictionary, const std::shared_ptr& attribute_vector, - const std::shared_ptr> offset_vector); + const std::shared_ptr>& offset_vector); // TODO:: remove - VariableStringDictionarySegment() : BaseDictionarySegment(DataType::String), _unique_value_count(0){}; + VariableStringDictionarySegment() : BaseDictionarySegment(DataType::String) {}; // returns an underlying dictionary std::shared_ptr> dictionary() const; @@ -36,12 +36,12 @@ class VariableStringDictionarySegment : public BaseDictionarySegment { std::optional get_typed_value(const ChunkOffset chunk_offset) const { // performance critical - not in cpp to help with inlining - const auto offset = _decompressor->get(chunk_offset); - if (offset == _dictionary->size()) { + const auto value_id = _decompressor->get(chunk_offset); + if (value_id == _offset_vector->size()) { return std::nullopt; } - return pmr_string(_dictionary->data() + offset); + return typed_value_of_value_id(ValueID{value_id}); } ChunkOffset size() const final; @@ -94,10 +94,9 @@ class VariableStringDictionarySegment : public BaseDictionarySegment { protected: const std::shared_ptr> _dictionary; - // Maps chunk offsets to dictionary offsets. + // Maps chunk offsets to value ids. const std::shared_ptr _attribute_vector; std::unique_ptr _decompressor; - const size_t _unique_value_count; // Maps value ids to dictionary offsets. const std::shared_ptr> _offset_vector; }; diff --git a/src/test/lib/storage/variable_string_dictionary_segment_test.cpp b/src/test/lib/storage/variable_string_dictionary_segment_test.cpp index dbb5bca55e..eb247af3d4 100644 --- a/src/test/lib/storage/variable_string_dictionary_segment_test.cpp +++ b/src/test/lib/storage/variable_string_dictionary_segment_test.cpp @@ -127,7 +127,6 @@ TEST_F(StorageVariableStringDictionarySegmentTest, MemoryUsageEstimation) { vs_str, DataType::String, SegmentEncodingSpec{EncodingType::VariableStringDictionary}); const auto empty_dictionary_segment = std::dynamic_pointer_cast(empty_compressed_segment); - // TODO: Why are we passing mode here? const auto empty_memory_usage = empty_dictionary_segment->memory_usage(MemoryUsageCalculationMode::Full); vs_str->append("A"); @@ -141,7 +140,6 @@ TEST_F(StorageVariableStringDictionarySegmentTest, MemoryUsageEstimation) { static constexpr auto size_of_dictionary = 3u; // We have to substract 1 since the empty VariableStringSegment actually contains one null terminator - // TODO: Why are we specifying mode? EXPECT_EQ(dictionary_segment->memory_usage(MemoryUsageCalculationMode::Full), empty_memory_usage - 1u + 3 * size_of_attribute + size_of_dictionary); } @@ -162,7 +160,24 @@ TEST_F(StorageVariableStringDictionarySegmentTest, TestLookup) { attribute_vector, VectorCompressionType::FixedWidthInteger, allocator, {4})), std::make_shared>(offsets)}; - // TODO: Actually test lookup and compare with data. + auto accessors = std::vector>{ + +[](const VariableStringDictionarySegment& segment, const ChunkOffset offset) { + const auto maybe = segment.get_typed_value(offset); + return maybe ? maybe.value() : NULL_VALUE; + }, + +[](const VariableStringDictionarySegment& segment, const ChunkOffset offset) { + return segment[offset]; + }}; + for (const auto& accessor : accessors) { + EXPECT_EQ(accessor(segment, ChunkOffset{0}), AllTypeVariant{"Hello"}); + EXPECT_EQ(accessor(segment, ChunkOffset{1}), AllTypeVariant{"Hello"}); + EXPECT_EQ(accessor(segment, ChunkOffset{2}), AllTypeVariant{"World"}); + EXPECT_EQ(accessor(segment, ChunkOffset{3}), AllTypeVariant{"String"}); + EXPECT_EQ(accessor(segment, ChunkOffset{4}), AllTypeVariant{"Alexander"}); + EXPECT_EQ(accessor(segment, ChunkOffset{5}), AllTypeVariant{""}); + EXPECT_EQ(accessor(segment, ChunkOffset{6}), AllTypeVariant{"Alexander"}); + } + // EXPECT_EQ(segment.) } } // namespace hyrise From bc86009f235b3a502cc664809f1f2e1cb02e956c Mon Sep 17 00:00:00 2001 From: ClFeSc <68013019+ClFeSc@users.noreply.github.com> Date: Tue, 4 Jul 2023 20:06:56 +0200 Subject: [PATCH 12/71] WIP: Some work in the script --- scripts/evaluate_string_segments.py | 213 +++++++++++++++++++++------- 1 file changed, 163 insertions(+), 50 deletions(-) diff --git a/scripts/evaluate_string_segments.py b/scripts/evaluate_string_segments.py index 862f26bf52..5bd036a9c3 100755 --- a/scripts/evaluate_string_segments.py +++ b/scripts/evaluate_string_segments.py @@ -2,7 +2,7 @@ import json import shutil from abc import ABC, abstractmethod -from argparse import ArgumentParser, ArgumentTypeError +from argparse import ArgumentParser, ArgumentTypeError, BooleanOptionalAction from dataclasses import dataclass import multiprocessing from os import path @@ -18,6 +18,7 @@ VERBOSITY_LEVEL = 0 +FAIL_FAST = False def print_debug(*args, required_verbosity_level: int, **kwargs) -> None: if VERBOSITY_LEVEL >= required_verbosity_level: @@ -57,11 +58,78 @@ class Configuration: encoding_under_test: str time_limit: int verbosity_level: int + + +class DictConvertible(ABC): + @abstractmethod + def as_dict(self) -> dict: + ... + + +@dataclass(frozen=True, slots=True) +class Runtime(DictConvertible): + benchmark_name: str + items_per_second: float + + def as_dict(self) -> dict: + return { + 'benchmark_name': self.benchmark_name, + 'items_per_second': self.items_per_second, + } + + +@dataclass(slots=True) +class Runtimes(DictConvertible): + runtimes: list[Runtime] + + def as_dict(self) -> dict: + return { + 'runtimes': list(runtime.as_dict() for runtime in self.runtimes) + } + + + +@dataclass(frozen=True, slots=True) +class MemoryConsumption(DictConvertible): + column_name: str + column_type: str + memory_consumption: int + + def as_dict(self) -> dict: + return { + 'column_name': self.column_name, + 'column_type': self.column_type, + 'memory_consumption': self.memory_consumption, + } + + +@dataclass(slots=True) +class Metrics(DictConvertible): + memory_consumptions: list[MemoryConsumption] + def as_dict(self) -> dict: + return { + 'memory_consumption': list(consumption.as_dict() for consumption in self.memory_consumptions) + } + + def min(self) -> int: + return min(map(lambda x: x.memory_consumption, self.memory_consumptions)) + + def max(self) -> int: + return max(map(lambda x: x.memory_consumption, self.memory_consumptions)) + + def average(self) -> float: + return statistics.fmean(map(lambda x: x.memory_consumption, self.memory_consumptions)) -def plot(results: Mapping[str, float], *, title: str, yaxis: str, path: str, figsize: tuple[int, int]=(15, 10)) -> None: + def median(self) -> float: + return statistics.median(map(lambda x: x.memory_consumption, self.memory_consumptions)) + + +def plot(results: dict[str, list], *, title: str, yaxis: str, path: str, figsize: tuple[int, int]=(15, 10)) -> None: f, axiis = plt.subplots(1, 1, figsize=figsize) - data=pd.DataFrame(data=results) + # data=pd.DataFrame(data=results) + data = pd.DataFrame.from_dict(results, orient='index') + data = data.transpose() print_debug(data, required_verbosity_level=3) if data.empty: print_error('Data Frame is empty; no result data to show!') @@ -79,6 +147,7 @@ def plot(results: Mapping[str, float], *, title: str, yaxis: str, path: str, fig class Benchmark(ABC): + name = 'Benchmark' _config: Configuration # These encodings are supported by default for string types. @@ -100,42 +169,54 @@ def create(name: str, *args: Any, **kwargs: Any) -> 'Benchmark': } return classes[name](*args, **kwargs) - def run(self, config: Configuration) -> None: + def run(self, config: Configuration) -> Mapping[str, Runtimes] | None: try: self._config = config output(f'## Benchmark {self.name}:', required_verbosity_level=1) - for threading in ['ST', 'MT']: + threading_options: list[Literal['ST', 'MT']] = ['ST', 'MT'] + for threading in threading_options: output(f'### {"Single Thread" if threading == "ST" else "Mulitple Threads"}', required_verbosity_level=1) result_jsons = [self._run(threading, encoding, False) for encoding in self._encodings] test_json = self._run(threading, self._config.encoding_under_test, False) - times = {encoding: self._compare_run(result_json) for result_json, encoding in zip(result_jsons, self._encodings)} + times = { + encoding: self._compare_run(result_json) + for result_json, encoding + in zip(result_jsons, self._encodings) + } times[self._config.encoding_under_test] = self._compare_run(test_json) plot_path = path.join(self._config.output_directory, 'runtime', 'plots', f'{self.name}-{threading}.png') Path(plot_path).parent.mkdir(parents=True, exist_ok=True) - plot(times, title=f'Items per second for {self.name}', yaxis='Items per second of individual benchmark tests', path=plot_path) + times_to_plot = { + encoding: list(map(lambda x: x.items_per_second, runtimes.runtimes)) + for encoding, runtimes + in times.items() + } + plot(times_to_plot, title=f'Items per second for {self.name}', yaxis='Items per second of individual benchmark tests', path=plot_path) # Dump raw data raw_file_path = path.join(self._config.output_directory, 'runtime', 'raw', f'{self.name}-{threading}.json') Path(raw_file_path).parent.mkdir(parents=True, exist_ok=True) with open(raw_file_path, 'w') as f: - json.dump(times, f) + raw_times = { + encoding: runtimes.as_dict() + for encoding, runtimes + in times.items() + } + json.dump(raw_times, f) # for (encoding, json) in zip(self._encodings, result_jsons): # check_command = ['./scripts/compare_benchmarks.py', json, test_json] # compare_result = check_output(check_command) # output(f'Result for {encoding} vs. {self._config.encoding_under_test}:', required_verbosity_level=3) # output(compare_result.decode(encoding='utf-8'), required_verbosity_level=3) + return times except Exception as ex: + if FAIL_FAST: + raise ex error_message = f'Skipped {self.name} due to error {ex}.' print_error(error_message) output(error_message, required_verbosity_level=4) + return None - def _compare_run(self, json: str) -> list[float]: - @dataclass(frozen=True) - class Runtime: - benchmark_name: str - runtime: float - @dataclass - class Runtimes: - runtimes: list[Runtime] + def _compare_run(self, json: str) -> Runtimes: json_file = read_json(json) runtimes = Runtimes([]) for benchmark in json_file['benchmarks']: @@ -145,9 +226,10 @@ class Runtimes: # duration = statistics.mean(float(run['duration']) for run in successful_runs) runtime = Runtime(name, duration) runtimes.runtimes.append(runtime) - return list(map(lambda x: x.runtime, runtimes.runtimes)) + # return list(map(lambda x: x.runtime, runtimes.runtimes)) + return runtimes - def compare_metrics(self, config: Configuration) -> None: + def compare_metrics(self, config: Configuration) -> Mapping[str, Metrics] | None: try: self._config = config output(f'### Collecting Metrics for {self.name}:', required_verbosity_level=1) @@ -157,41 +239,34 @@ def compare_metrics(self, config: Configuration) -> None: metrics[self._config.encoding_under_test]= self._compare_metrics(test_json) plot_path = path.join(self._config.output_directory, 'metrics', 'plots', f'{self.name}.png') Path(plot_path).parent.mkdir(parents=True, exist_ok=True) - plot(metrics, title=f'Sizes for {self.name}', yaxis='Size of Segments', path=plot_path) + metrics_to_plot: dict[str, list[int]] = {} + for encoding, metric in metrics.items(): + metrics_to_plot[f'{encoding} (String)'] = list(map(lambda x: x.memory_consumption, filter(lambda x: x.column_type == 'string', metric.memory_consumptions))) + metrics_to_plot[f'{encoding} (All)'] = list(map(lambda x: x.memory_consumption, metric.memory_consumptions)) + plot(metrics_to_plot, title=f'Sizes for {self.name}', yaxis='Size of Segments', path=plot_path, figsize=(22, 10)) # Dump raw data raw_file_path = path.join(self._config.output_directory, 'metrics', 'raw', f'{self.name}.json') Path(raw_file_path).parent.mkdir(parents=True, exist_ok=True) with open(raw_file_path, 'w') as f: - json.dump(metrics, f) + raw_metrics = { + encoding: metric.as_dict() + for encoding, metric + in metrics.items() + } + json.dump(raw_metrics, f) # for encoding, json in zip(self._encodings, result_jsons): # self._compare_metrics(json, test_json, encoding, self._config.encoding_under_test) + return metrics except Exception as ex: + if FAIL_FAST: + raise ex error_message = f'Skipped {self.name} due to error {ex}.' print_error(error_message) output(error_message, required_verbosity_level=4) + return None # def _compare_metrics(self, reference_json: str, test_json: str, reference_encoding: str, test_encoding: str) -> 'Metrics': - def _compare_metrics(self, json: str) -> list[int]: - @dataclass(frozen=True) - class MemoryConsumption: - column_name: str - column_type: str - memory_consumption: int - @dataclass - class Metrics: - memory_consumptions: list[MemoryConsumption] - - def min(self) -> int: - return min(map(lambda x: x.memory_consumption, self.memory_consumptions)) - - def max(self) -> int: - return max(map(lambda x: x.memory_consumption, self.memory_consumptions)) - - def average(self) -> float: - return statistics.fmean(map(lambda x: x.memory_consumption, self.memory_consumptions)) - - def median(self) -> float: - return statistics.median(map(lambda x: x.memory_consumption, self.memory_consumptions)) + def _compare_metrics(self, json: str) -> Metrics: json_file = read_json(json) # test = read_json(test_json) metrics = Metrics([]) @@ -200,14 +275,15 @@ def median(self) -> float: # for ref_segment, test_segment in zip(reference['segments'], test['segments']): column_type = segment['column_data_type'] column_name = segment['column_name'] - # Only look at string columns - if column_type != 'string': - continue + # # Only look at string columns + # if column_type != 'string': + # continue metrics.memory_consumptions.append(MemoryConsumption(column_name, column_type, segment["estimated_size_in_bytes"])) # test_metrics.memory_consumptions.append(MemoryConsumption(column_name, column_type, test_segment["estimated_size_in_bytes"])) # output(f'For reference encoding {reference_encoding}: {{ min: {reference_metrics.min()}, max: {reference_metrics.max()}, average: {reference_metrics.average()}, median: {reference_metrics.median()} }}; ' + # f'For test encoding {test_encoding}: {{ min: {test_metrics.min()}, max: {test_metrics.max()}, average: {test_metrics.average()}, median: {test_metrics.median()} }}', required_verbosity_level=2) - return list(map(lambda x: x.memory_consumption, metrics.memory_consumptions)) + # return list(map(lambda x: x.memory_consumption, metrics.memory_consumptions)) + return metrics def _run(self, threading: Literal['ST', 'MT'], encoding: str, metrics: bool) -> str: self._pre_run_cleanup() @@ -232,11 +308,11 @@ def _get_arguments(self, threading: Literal['ST', 'MT'], encoding: str, metrics: @abstractmethod def _pre_run_cleanup(self) -> None: - pass + ... @property def _path(self) -> str: - pass + ... def _output_path(self, threading: Literal['ST'] | Literal['MT'], encoding: str, metrics: bool) -> str: return path.join(self._config.tmp_path, f'{self.name}-{threading}-{encoding}-{metrics}.json') @@ -295,7 +371,8 @@ def _path(self) -> str: def parse_arguments() -> Configuration: global VERBOSITY_LEVEL - def check_positive(value: any) -> int: + global FAIL_FAST + def check_positive(value) -> int: ivalue = int(value) if ivalue <= 0: raise ArgumentTypeError("%s is an invalid positive int value" % value) @@ -360,10 +437,21 @@ def check_positive(value: any) -> int: action='count', help='Verbosity level. Supports multiple ocurrences (like `-vvv`) to increase verbosity.' ) + parser.add_argument( + '--fail-fast', + dest='fail_fast', + action=BooleanOptionalAction, + required=False, + default=False, + help='Whether to fail early when an error occurs' + ) namespace = parser.parse_args() if namespace.verbosity_level > 0: VERBOSITY_LEVEL = namespace.verbosity_level print_debug(f'Verbosity Mode enabled on level {VERBOSITY_LEVEL}.', required_verbosity_level=VERBOSITY_LEVEL) + if namespace.fail_fast: + FAIL_FAST = True + print_debug(f'Fail Fast Mode enabled', required_verbosity_level=2) now = datetime.now() now_date = f'{now.year}{now.month:02d}{now.day:02d}' now_time = f'{now.hour}{now.minute:02d}{now.second:02d}' @@ -393,6 +481,27 @@ def locate_benchmarks(benchmarks: list[str], config: Configuration) -> list[Benc return benchmark_objects +def plot_stats(stats: dict[str, tuple[Mapping[str, Runtimes], Mapping[str, Metrics]]], *, figsize: tuple[int, int]=(15, 10)) -> None: + f, axiis = plt.subplots(2, 2, figsize=figsize) + # data=pd.DataFrame(data=results) + data = pd.DataFrame.from_dict(stats, orient='index') + data = data.transpose() + print_debug(data, required_verbosity_level=3) + if data.empty: + print_error('Data Frame is empty; no result data to show!') + return + data.plot( + kind='box', + ax=axiis, + title=title, + xlabel='Encodings', + ylabel=f'{yaxis} (Logarithmic Scale)', + logy=True, + ) + f.tight_layout() + f.savefig(path) + + def main(): global output_file config = parse_arguments() @@ -403,9 +512,13 @@ def main(): print_debug(f'Running benchmark comparing {config.encoding_under_test} Encoding against built-in encodings.', required_verbosity_level=1) benchmarks_names = ['hyriseBenchmarkTPCH', 'hyriseBenchmarkTPCDS', 'hyriseBenchmarkJoinOrder', 'hyriseBenchmarkStarSchema'] benchmarks = locate_benchmarks(benchmarks_names, config) + stats: dict[str, tuple[Mapping[str, Runtimes], Mapping[str, Metrics]]] = {} for benchmark in benchmarks: - benchmark.run(config) - benchmark.compare_metrics(config) + runtimes = benchmark.run(config) + metrics = benchmark.compare_metrics(config) + if runtimes is not None and metrics is not None: + stats[benchmark.name] = (runtimes, metrics) + plot_stats(stats) if __name__ == '__main__': From c17acefaa1940335c6a9fc84f88542f9ba70d7dc Mon Sep 17 00:00:00 2001 From: ClFeSc <68013019+ClFeSc@users.noreply.github.com> Date: Wed, 5 Jul 2023 13:45:17 +0200 Subject: [PATCH 13/71] Add skip benchmarks option --- scripts/evaluate_string_segments.py | 15 ++++++++++++++- 1 file changed, 14 insertions(+), 1 deletion(-) diff --git a/scripts/evaluate_string_segments.py b/scripts/evaluate_string_segments.py index 5bd036a9c3..0244ee5771 100755 --- a/scripts/evaluate_string_segments.py +++ b/scripts/evaluate_string_segments.py @@ -58,6 +58,7 @@ class Configuration: encoding_under_test: str time_limit: int verbosity_level: int + skip_benchmarks: bool class DictConvertible(ABC): @@ -286,6 +287,9 @@ def _compare_metrics(self, json: str) -> Metrics: return metrics def _run(self, threading: Literal['ST', 'MT'], encoding: str, metrics: bool) -> str: + if self._config.skip_benchmarks: + print_debug(f'Skipping benchmark {self.name} for encoding {encoding}.', required_verbosity_level=2) + return self._output_path(threading, encoding, metrics) self._pre_run_cleanup() print_debug(f'Running benchmark {self.name} for encoding {encoding}.', required_verbosity_level=1) st_command = self._get_arguments(threading, encoding, metrics) @@ -445,6 +449,14 @@ def check_positive(value) -> int: default=False, help='Whether to fail early when an error occurs' ) + parser.add_argument( + '--skip-benchmarks', + dest='skip_benchmarks', + action=BooleanOptionalAction, + required=False, + default=False, + help='Whether to skip running the benchmarks and instead using old data.' + ) namespace = parser.parse_args() if namespace.verbosity_level > 0: VERBOSITY_LEVEL = namespace.verbosity_level @@ -462,7 +474,8 @@ def check_positive(value) -> int: scale_factor=namespace.scale_factor, encoding_under_test=namespace.encoding, time_limit=namespace.timeout, - verbosity_level=namespace.verbosity_level) + verbosity_level=namespace.verbosity_level, + skip_benchmarks=namespace.skip_benchmarks) def scale_factor_for_benchmark(benchmark: str, scale_factor: float) -> float: From 2aba648fb4c85419ac7a4e374cc092e3c40b77ec Mon Sep 17 00:00:00 2001 From: ClFeSc <68013019+ClFeSc@users.noreply.github.com> Date: Wed, 5 Jul 2023 15:59:31 +0200 Subject: [PATCH 14/71] WIP: Fix some issues --- .../variable_string_dictionary_encoder.hpp | 22 ++++++++++--------- .../variable_string_dictionary_segment.cpp | 2 +- .../variable_string_dictionary_segment.hpp | 4 +++- src/test/CMakeLists.txt | 2 +- src/test/lib/operators/join_test_runner.cpp | 1 + .../lib/operators/table_scan_between_test.cpp | 18 +++++++-------- 6 files changed, 27 insertions(+), 22 deletions(-) diff --git a/src/lib/storage/variable_string_dictionary/variable_string_dictionary_encoder.hpp b/src/lib/storage/variable_string_dictionary/variable_string_dictionary_encoder.hpp index c8d161dab8..8d0ad66bd0 100644 --- a/src/lib/storage/variable_string_dictionary/variable_string_dictionary_encoder.hpp +++ b/src/lib/storage/variable_string_dictionary/variable_string_dictionary_encoder.hpp @@ -34,7 +34,7 @@ class VariableStringDictionaryEncoder : public SegmentEncoder(); // contains the actual values (no NULLs) auto null_values = std::vector(); // bitmap to mark NULL values - auto string_to_positions = std::unordered_map>(); + auto string_to_chunk_offsets = std::unordered_map>(); auto segment_size = uint32_t{0}; segment_iterable.with_iterators([&](auto segment_it, const auto segment_end) { @@ -47,17 +47,19 @@ class VariableStringDictionaryEncoder : public SegmentEncoder>(pmr_vector(segment_size)); + auto chunk_offset_to_value_id = std::make_shared>(pmr_vector(segment_size)); auto offset_vector = std::make_shared>(pmr_vector(dense_values.size())); offset_vector->shrink_to_fit(); - for (const auto& [string, positions] : string_to_positions) { + for (const auto& [string, chunk_offsets] : string_to_chunk_offsets) { const auto here_offset = string_offsets[string]; const auto here_value_id = string_value_ids[string]; (*offset_vector)[here_value_id] = here_offset; - for (const auto position : positions) { - (*position_to_value_id)[position] = here_value_id; + for (const auto chunk_offset : chunk_offsets) { + (*chunk_offset_to_value_id)[chunk_offset] = here_value_id; } } @@ -95,16 +97,16 @@ class VariableStringDictionaryEncoder : public SegmentEncoder( - compress_vector(*position_to_value_id, SegmentEncoder::vector_compression_type(), + const auto compressed_chunk_offset_to_value_id = std::shared_ptr( + compress_vector(*chunk_offset_to_value_id, SegmentEncoder::vector_compression_type(), allocator, {max_value_id})); - return std::make_shared(klotz, compressed_position_to_value_id, offset_vector); + return std::make_shared(klotz, compressed_chunk_offset_to_value_id, offset_vector); } // private: diff --git a/src/lib/storage/variable_string_dictionary_segment.cpp b/src/lib/storage/variable_string_dictionary_segment.cpp index 126ece4268..918d28ff36 100644 --- a/src/lib/storage/variable_string_dictionary_segment.cpp +++ b/src/lib/storage/variable_string_dictionary_segment.cpp @@ -102,7 +102,7 @@ ValueID VariableStringDictionarySegment::upper_bound(const AllTypeVariant& value } AllTypeVariant VariableStringDictionarySegment::value_of_value_id(const ValueID value_id) const { - return typed_value_of_value_id(value_id); + return value_id == null_value_id() ? NULL_VALUE : typed_value_of_value_id(value_id); } pmr_string VariableStringDictionarySegment::typed_value_of_value_id(const ValueID value_id) const { diff --git a/src/lib/storage/variable_string_dictionary_segment.hpp b/src/lib/storage/variable_string_dictionary_segment.hpp index cf8d5826ad..fc28d1c52f 100644 --- a/src/lib/storage/variable_string_dictionary_segment.hpp +++ b/src/lib/storage/variable_string_dictionary_segment.hpp @@ -24,6 +24,8 @@ class VariableStringDictionarySegment : public BaseDictionarySegment { // TODO:: remove VariableStringDictionarySegment() : BaseDictionarySegment(DataType::String) {}; + + // TODO:: Does this make sense? // returns an underlying dictionary std::shared_ptr> dictionary() const; @@ -37,7 +39,7 @@ class VariableStringDictionarySegment : public BaseDictionarySegment { std::optional get_typed_value(const ChunkOffset chunk_offset) const { // performance critical - not in cpp to help with inlining const auto value_id = _decompressor->get(chunk_offset); - if (value_id == _offset_vector->size()) { + if (value_id == null_value_id()) { return std::nullopt; } diff --git a/src/test/CMakeLists.txt b/src/test/CMakeLists.txt index 8272d9bd13..ca6d316b68 100644 --- a/src/test/CMakeLists.txt +++ b/src/test/CMakeLists.txt @@ -279,7 +279,7 @@ set( add_compile_options(-Wno-used-but-marked-unused) # We define TEST_PLUGIN_DIR to always load plugins from the correct directory for testing purposes -add_definitions(-DTEST_PLUGIN_DIR= "${CMAKE_BINARY_DIR}/lib/") +add_definitions(-DTEST_PLUGIN_DIR="${CMAKE_BINARY_DIR}/lib/") # Build special sanitizer version of googletest include_directories(../../third_party/googletest/googletest/) diff --git a/src/test/lib/operators/join_test_runner.cpp b/src/test/lib/operators/join_test_runner.cpp index e3bafae9c6..18ffc780b8 100644 --- a/src/test/lib/operators/join_test_runner.cpp +++ b/src/test/lib/operators/join_test_runner.cpp @@ -111,6 +111,7 @@ bool operator<(const JoinTestConfiguration& l, const JoinTestConfiguration& r) { return l.to_tuple() < r.to_tuple(); } +// TODO: Remove attribute bool __attribute__((used)) operator==(const JoinTestConfiguration& l, const JoinTestConfiguration& r) { return l.to_tuple() == r.to_tuple(); } diff --git a/src/test/lib/operators/table_scan_between_test.cpp b/src/test/lib/operators/table_scan_between_test.cpp index 134ee46a9a..a4c8434747 100644 --- a/src/test/lib/operators/table_scan_between_test.cpp +++ b/src/test/lib/operators/table_scan_between_test.cpp @@ -182,15 +182,15 @@ class TableScanBetweenTest : public TypedOperatorBaseTest { TEST_P(TableScanBetweenTest, Inclusive) { auto inclusive_tests = std::vector>>{ {12.25, 16.25, {1, 2, 3}}, // Both boundaries exact match - {12.0, 16.25, {1, 2, 3}}, // Left boundary open match - {12.25, 16.75, {1, 2, 3}}, // Right boundary open match - {12.0, 16.75, {1, 2, 3}}, // Both boundaries open match - {0.0, 16.75, {0, 1, 2, 3}}, // Left boundary before first value - {16.0, 50.75, {3, 4, 5, 6, 7, 8, 9, 10}}, // Right boundary after last value - {13.0, 16.25, {2, 3}}, // Left boundary after first value - {12.25, 15.0, {1, 2}}, // Right boundary before last value - {0.25, 50.75, {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10}}, // Matching all values - {0.25, 0.75, {}}, // Matching no value +// {12.0, 16.25, {1, 2, 3}}, // Left boundary open match +// {12.25, 16.75, {1, 2, 3}}, // Right boundary open match +// {12.0, 16.75, {1, 2, 3}}, // Both boundaries open match +// {0.0, 16.75, {0, 1, 2, 3}}, // Left boundary before first value +// {16.0, 50.75, {3, 4, 5, 6, 7, 8, 9, 10}}, // Right boundary after last value +// {13.0, 16.25, {2, 3}}, // Left boundary after first value +// {12.25, 15.0, {1, 2}}, // Right boundary before last value +// {0.25, 50.75, {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10}}, // Matching all values +// {0.25, 0.75, {}}, // Matching no value }; _test_between_scan(inclusive_tests, PredicateCondition::BetweenInclusive); From 508c43175f5dc5c5adddac24b27848a7cda41a88 Mon Sep 17 00:00:00 2001 From: ClFeSc <68013019+ClFeSc@users.noreply.github.com> Date: Wed, 5 Jul 2023 18:37:22 +0200 Subject: [PATCH 15/71] WIP: Try to fix tests --- .../import_export/binary/binary_parser.cpp | 7 +- .../import_export/binary/binary_parser.hpp | 3 +- .../import_export/binary/binary_writer.cpp | 20 ++ .../import_export/binary/binary_writer.hpp | 4 + .../storage/create_iterable_from_segment.hpp | 6 + .../storage/create_iterable_from_segment.ipp | 15 ++ .../dictionary_segment_iterable.hpp | 4 + .../fixed_string_dictionary_segment.hpp | 1 + .../storage/resolve_encoded_segment_type.hpp | 4 +- .../variable_string_dictionary_encoder.hpp | 2 +- .../variable_string_dictionary_iterable.hpp | 188 ++++++++++++++++++ .../variable_string_dictionary_segment.cpp | 50 +++-- .../variable_string_dictionary_segment.hpp | 5 + src/test/base_test.hpp | 3 +- 14 files changed, 290 insertions(+), 22 deletions(-) create mode 100644 src/lib/storage/variable_string_dictionary/variable_string_dictionary_iterable.hpp diff --git a/src/lib/import_export/binary/binary_parser.cpp b/src/lib/import_export/binary/binary_parser.cpp index c46457d36b..4b04dac157 100644 --- a/src/lib/import_export/binary/binary_parser.cpp +++ b/src/lib/import_export/binary/binary_parser.cpp @@ -159,7 +159,7 @@ std::shared_ptr BinaryParser::_import_segment(std::ifstream& fi Fail("Unsupported data type for FixedStringDictionary encoding"); } case EncodingType::VariableStringDictionary: - return _import_variable_string_length_segment(file, row_count); + return _import_variable_string_length_segment(file, row_count); case EncodingType::RunLength: return _import_run_length_segment(file, row_count); case EncodingType::FrameOfReference: @@ -204,7 +204,8 @@ std::shared_ptr> BinaryParser::_import_dictionary_segment(s return std::make_shared>(dictionary, attribute_vector); } -std::shared_ptr BinaryParser::_import_variable_string_length_segment(std::ifstream& file, +template +std::shared_ptr> BinaryParser::_import_variable_string_length_segment(std::ifstream& file, ChunkOffset row_count) { // const auto compressed_vector_type_id = _read_value(file); // const auto dictionary_size = _read_value(file); @@ -222,7 +223,7 @@ std::shared_ptr BinaryParser::_import_variable_ // offset += string.size(); // } Fail("Not implemented yet."); - return std::make_shared(); + return std::make_shared>(); // return std::make_shared(dictionary, attribute_vector, std::make_shared>()); } diff --git a/src/lib/import_export/binary/binary_parser.hpp b/src/lib/import_export/binary/binary_parser.hpp index fb63e13825..556b56ae40 100644 --- a/src/lib/import_export/binary/binary_parser.hpp +++ b/src/lib/import_export/binary/binary_parser.hpp @@ -77,7 +77,8 @@ class BinaryParser { template static std::shared_ptr> _import_dictionary_segment(std::ifstream& file, ChunkOffset row_count); - static std::shared_ptr _import_variable_string_length_segment(std::ifstream& file, + template + static std::shared_ptr> _import_variable_string_length_segment(std::ifstream& file, ChunkOffset row_count); static std::shared_ptr> _import_fixed_string_dictionary_segment( diff --git a/src/lib/import_export/binary/binary_writer.cpp b/src/lib/import_export/binary/binary_writer.cpp index ac359a80be..dc9470b297 100644 --- a/src/lib/import_export/binary/binary_writer.cpp +++ b/src/lib/import_export/binary/binary_writer.cpp @@ -335,6 +335,26 @@ void BinaryWriter::_write_segment(const LZ4Segment& lz4_segment, bool /*colum } } +template +void BinaryWriter::_write_segment(const VariableStringDictionarySegment& dictionary_segment, bool /*column_is_nullable*/, + std::ofstream& ofstream) { + export_value(ofstream, EncodingType::VariableStringDictionary); + + // Write attribute vector compression id + const auto compressed_vector_type_id = _compressed_vector_type_id(dictionary_segment); + export_value(ofstream, compressed_vector_type_id); + + // Write the dictionary size and dictionary + export_value(ofstream, static_cast(dictionary_segment.dictionary()->size())); + export_values(ofstream, *dictionary_segment.dictionary()); + + // Write attribute vector + _export_compressed_vector(ofstream, *dictionary_segment.compressed_vector_type(), + *dictionary_segment.attribute_vector()); + + // TODO: Write Offset vector? +} + template CompressedVectorTypeID BinaryWriter::_compressed_vector_type_id( const AbstractEncodedSegment& abstract_encoded_segment) { diff --git a/src/lib/import_export/binary/binary_writer.hpp b/src/lib/import_export/binary/binary_writer.hpp index 9433fd54de..89ff29933b 100644 --- a/src/lib/import_export/binary/binary_writer.hpp +++ b/src/lib/import_export/binary/binary_writer.hpp @@ -222,6 +222,10 @@ class BinaryWriter { template static void _write_segment(const LZ4Segment& lz4_segment, bool /*column_is_nullable*/, std::ofstream& ofstream); + template + static void _write_segment(const VariableStringDictionarySegment& dictionary_segment, bool /*column_is_nullable*/, + std::ofstream& ofstream); + template static CompressedVectorTypeID _compressed_vector_type_id(const AbstractEncodedSegment& abstract_encoded_segment); diff --git a/src/lib/storage/create_iterable_from_segment.hpp b/src/lib/storage/create_iterable_from_segment.hpp index ca592b3ef5..bf6044448b 100644 --- a/src/lib/storage/create_iterable_from_segment.hpp +++ b/src/lib/storage/create_iterable_from_segment.hpp @@ -26,6 +26,9 @@ class ReferenceSegment; template class ReferenceSegmentIterable; +template +class VariableStringDictionarySegment; + /** * @defgroup Uniform interface to create an iterable from a segment * @@ -73,6 +76,9 @@ template auto create_iterable_from_segment(const ReferenceSegment& segment); +template +auto create_iterable_from_segment(const VariableStringDictionarySegment& segment); + /**@}*/ } // namespace hyrise diff --git a/src/lib/storage/create_iterable_from_segment.ipp b/src/lib/storage/create_iterable_from_segment.ipp index b05e909721..9ac036ba90 100644 --- a/src/lib/storage/create_iterable_from_segment.ipp +++ b/src/lib/storage/create_iterable_from_segment.ipp @@ -6,6 +6,8 @@ #include "storage/run_length_segment/run_length_segment_iterable.hpp" #include "storage/segment_iterables/any_segment_iterable.hpp" #include "storage/value_segment/value_segment_iterable.hpp" +#include "storage/variable_string_dictionary/variable_string_dictionary_iterable.hpp" +#include "storage/variable_string_dictionary_segment.hpp" namespace hyrise { @@ -81,4 +83,17 @@ auto create_iterable_from_segment(const LZ4Segment& segment) { return AnySegmentIterable(LZ4SegmentIterable(segment)); } +template +auto create_iterable_from_segment(const VariableStringDictionarySegment& segment) { +#ifdef HYRISE_ERASE_VARIABLESTRINGDICTIONARY + PerformanceWarning("VariableStringDictionarySegmentIterable erased by compile-time setting"); + return AnySegmentIterable(DictionarySegmentIterable(segment)); +#else + if constexpr (EraseSegmentType) { + return create_any_segment_iterable(segment); + } else { + return VariableStringDictionarySegmentIterable{segment}; + } +#endif +} } // namespace hyrise diff --git a/src/lib/storage/dictionary_segment/dictionary_segment_iterable.hpp b/src/lib/storage/dictionary_segment/dictionary_segment_iterable.hpp index 2997ada7b7..3b260c2497 100644 --- a/src/lib/storage/dictionary_segment/dictionary_segment_iterable.hpp +++ b/src/lib/storage/dictionary_segment/dictionary_segment_iterable.hpp @@ -7,6 +7,7 @@ #include "storage/fixed_string_dictionary_segment.hpp" #include "storage/segment_iterables.hpp" #include "storage/vector_compression/resolve_compressed_vector_type.hpp" +#include "storage/variable_string_dictionary_segment.hpp" namespace hyrise { @@ -21,6 +22,9 @@ class DictionarySegmentIterable : public PointAccessibleSegmentIterable& segment) : _segment{segment}, _dictionary(segment.fixed_string_dictionary()) {} + explicit DictionarySegmentIterable(const VariableStringDictionarySegment& segment) + : _segment{segment}, _dictionary(segment.dictionary()) {} + template void _on_with_iterators(const Functor& functor) const { _segment.access_counter[SegmentAccessCounter::AccessType::Sequential] += _segment.size(); diff --git a/src/lib/storage/fixed_string_dictionary_segment.hpp b/src/lib/storage/fixed_string_dictionary_segment.hpp index 2fc10785f9..94bf4f819f 100644 --- a/src/lib/storage/fixed_string_dictionary_segment.hpp +++ b/src/lib/storage/fixed_string_dictionary_segment.hpp @@ -5,6 +5,7 @@ #include "base_dictionary_segment.hpp" #include "fixed_string_dictionary_segment/fixed_string_vector.hpp" +#include "variable_string_dictionary_segment.hpp" #include "types.hpp" #include "vector_compression/base_compressed_vector.hpp" diff --git a/src/lib/storage/resolve_encoded_segment_type.hpp b/src/lib/storage/resolve_encoded_segment_type.hpp index 9a6ea93bd8..78ffd0c534 100644 --- a/src/lib/storage/resolve_encoded_segment_type.hpp +++ b/src/lib/storage/resolve_encoded_segment_type.hpp @@ -13,6 +13,7 @@ #include "storage/frame_of_reference_segment.hpp" #include "storage/lz4_segment.hpp" #include "storage/run_length_segment.hpp" +#include "storage/variable_string_dictionary_segment.hpp" #include "storage/encoding_type.hpp" @@ -34,7 +35,8 @@ constexpr auto encoded_segment_for_type = hana::make_map( hana::make_pair(enum_c, template_c), hana::make_pair(enum_c, template_c), - hana::make_pair(enum_c, template_c)); + hana::make_pair(enum_c, template_c), + hana::make_pair(enum_c, template_c)); // When adding something here, please also append all_segment_encoding_specs in the BaseTest class. diff --git a/src/lib/storage/variable_string_dictionary/variable_string_dictionary_encoder.hpp b/src/lib/storage/variable_string_dictionary/variable_string_dictionary_encoder.hpp index 8d0ad66bd0..83fd8caa1e 100644 --- a/src/lib/storage/variable_string_dictionary/variable_string_dictionary_encoder.hpp +++ b/src/lib/storage/variable_string_dictionary/variable_string_dictionary_encoder.hpp @@ -106,7 +106,7 @@ class VariableStringDictionaryEncoder : public SegmentEncoder::vector_compression_type(), allocator, {max_value_id})); - return std::make_shared(klotz, compressed_chunk_offset_to_value_id, offset_vector); + return std::make_shared>(klotz, compressed_chunk_offset_to_value_id, offset_vector); } // private: diff --git a/src/lib/storage/variable_string_dictionary/variable_string_dictionary_iterable.hpp b/src/lib/storage/variable_string_dictionary/variable_string_dictionary_iterable.hpp new file mode 100644 index 0000000000..dccb3ead69 --- /dev/null +++ b/src/lib/storage/variable_string_dictionary/variable_string_dictionary_iterable.hpp @@ -0,0 +1,188 @@ +#pragma once + +#include + +#include "storage/abstract_segment.hpp" +#include "storage/variable_string_dictionary_segment.hpp" +#include "storage/fixed_string_dictionary_segment.hpp" +#include "storage/segment_iterables.hpp" +#include "storage/vector_compression/resolve_compressed_vector_type.hpp" + +namespace hyrise { + +template +class VariableStringDictionarySegmentIterable : public PointAccessibleSegmentIterable> { + public: + using ValueType = T; + using Dictionary = pmr_vector; + + explicit VariableStringDictionarySegmentIterable(const VariableStringDictionarySegment& segment) + : _segment{segment}, _dictionary(segment.dictionary()) {} + + template + void _on_with_iterators(const Functor& functor) const { + _segment.access_counter[SegmentAccessCounter::AccessType::Sequential] += _segment.size(); + _segment.access_counter[SegmentAccessCounter::AccessType::Dictionary] += _segment.size(); + + resolve_compressed_vector_type(*_segment.attribute_vector(), [&](const auto& vector) { + using CompressedVectorIterator = decltype(vector.cbegin()); + using DictionaryIteratorType = decltype(_dictionary->cbegin()); + + const auto offset_accessor = [&](const ValueID value_id) { + return _segment.offset_vector()->operator[](value_id); + }; + + auto begin = Iterator{ + _dictionary->cbegin(), _segment.null_value_id(), vector.cbegin(), ChunkOffset{0u}, offset_accessor}; + auto end = Iterator{ + _dictionary->cbegin(), _segment.null_value_id(), vector.cend(), static_cast(_segment.size()), offset_accessor}; + + functor(begin, end); + }); + } + + template + void _on_with_iterators(const std::shared_ptr& position_filter, const Functor& functor) const { + _segment.access_counter[SegmentAccessCounter::access_type(*position_filter)] += position_filter->size(); + _segment.access_counter[SegmentAccessCounter::AccessType::Dictionary] += position_filter->size(); + + resolve_compressed_vector_type(*_segment.attribute_vector(), [&](const auto& vector) { + using Decompressor = std::decay_t; + using DictionaryIteratorType = decltype(_dictionary->cbegin()); + + using PosListIteratorType = decltype(position_filter->cbegin()); + auto begin = PointAccessIterator{ + _dictionary->cbegin(), _segment.null_value_id(), vector.create_decompressor(), position_filter->cbegin(), + position_filter->cbegin()}; + auto end = PointAccessIterator{ + _dictionary->cbegin(), _segment.null_value_id(), vector.create_decompressor(), position_filter->cbegin(), + position_filter->cend()}; + functor(begin, end); + }); + } + + size_t _on_size() const { + return _segment.size(); + } + + private: + template + class Iterator + : public AbstractSegmentIterator, SegmentPosition> { + public: + using ValueType = T; + using IterableType = VariableStringDictionarySegmentIterable; + + Iterator(DictionaryIteratorType dictionary_begin_it, ValueID null_value_id, CompressedVectorIterator attribute_it, + ChunkOffset chunk_offset, OffsetAccessor offset_accessor /*const std::shared_ptr>& offset_vector*/) + : _dictionary_begin_it{std::move(dictionary_begin_it)}, + _null_value_id{null_value_id}, + _attribute_it{std::move(attribute_it)}, + _chunk_offset{chunk_offset}, + _offset_accessor{offset_accessor} {} + + private: + friend class boost::iterator_core_access; // grants the boost::iterator_facade access to the private interface + + void increment() { + ++_attribute_it; + ++_chunk_offset; + } + + void decrement() { + --_attribute_it; + --_chunk_offset; + } + + void advance(std::ptrdiff_t n) { + _attribute_it += n; + _chunk_offset += n; + } + + bool equal(const Iterator& other) const { + return _attribute_it == other._attribute_it; + } + + std::ptrdiff_t distance_to(const Iterator& other) const { + return other._attribute_it - _attribute_it; + } + + SegmentPosition dereference() const { + const auto value_id = static_cast(*_attribute_it); + const auto is_null = (value_id == _null_value_id); + + if (is_null) { + return SegmentPosition{T{}, true, _chunk_offset}; + } + + return SegmentPosition{T{*(_dictionary_begin_it + _offset_accessor(value_id))}, false, _chunk_offset}; + } + + private: + DictionaryIteratorType _dictionary_begin_it; + ValueID _null_value_id; + CompressedVectorIterator _attribute_it; + ChunkOffset _chunk_offset; + // const std::shared_ptr>& fset_vector; + OffsetAccessor _offset_accessor; + }; + + template + class PointAccessIterator : public AbstractPointAccessSegmentIterator< + PointAccessIterator, + SegmentPosition, PosListIteratorType> { + public: + using ValueType = T; + using IterableType = VariableStringDictionarySegmentIterable; + + PointAccessIterator(DictionaryIteratorType dictionary_begin_it, const ValueID null_value_id, + Decompressor attribute_decompressor, PosListIteratorType position_filter_begin, + PosListIteratorType position_filter_it) + : AbstractPointAccessSegmentIterator< + PointAccessIterator, SegmentPosition, + PosListIteratorType>{std::move(position_filter_begin), std::move(position_filter_it)}, + _dictionary_begin_it{std::move(dictionary_begin_it)}, + _null_value_id{null_value_id}, + _attribute_decompressor{std::move(attribute_decompressor)} {} + + private: + friend class boost::iterator_core_access; // grants the boost::iterator_facade access to the private interface + + SegmentPosition dereference() const { + const auto& chunk_offsets = this->chunk_offsets(); + + const auto value_id = _attribute_decompressor.get(chunk_offsets.offset_in_referenced_chunk); + const auto is_null = (value_id == _null_value_id); + + if (is_null) { + return SegmentPosition{T{}, true, chunk_offsets.offset_in_poslist}; + } + + return SegmentPosition{T{*(_dictionary_begin_it + value_id)}, false, chunk_offsets.offset_in_poslist}; + } + + private: + DictionaryIteratorType _dictionary_begin_it; + ValueID _null_value_id; + mutable Decompressor _attribute_decompressor; + }; + + private: + const VariableStringDictionarySegment& _segment; + std::shared_ptr _dictionary; +}; + +//template +//struct is_dictionary_segment_iterable { +// static constexpr auto value = false; +//}; +// +//template