diff --git a/resources/test_data/bin/VariableStringDictionaryMultipleChunks.bin b/resources/test_data/bin/VariableStringDictionaryMultipleChunks.bin new file mode 100644 index 0000000000..fc32e3182a Binary files /dev/null and b/resources/test_data/bin/VariableStringDictionaryMultipleChunks.bin differ diff --git a/resources/test_data/bin/VariableStringDictionaryNullValue.bin b/resources/test_data/bin/VariableStringDictionaryNullValue.bin new file mode 100644 index 0000000000..0601134cc0 Binary files /dev/null and b/resources/test_data/bin/VariableStringDictionaryNullValue.bin differ diff --git a/resources/test_data/bin/VariableStringDictionarySingleChunk.bin b/resources/test_data/bin/VariableStringDictionarySingleChunk.bin new file mode 100644 index 0000000000..095e7da3c7 Binary files /dev/null and b/resources/test_data/bin/VariableStringDictionarySingleChunk.bin differ diff --git a/scripts/evaluate_string_segments.py b/scripts/evaluate_string_segments.py new file mode 100755 index 0000000000..57542026d0 --- /dev/null +++ b/scripts/evaluate_string_segments.py @@ -0,0 +1,837 @@ +#!/usr/bin/env python3 +import argparse +import json +import shutil +from abc import ABC, abstractmethod +from argparse import ArgumentParser, ArgumentTypeError, BooleanOptionalAction +from dataclasses import dataclass +import multiprocessing +import os +from pathlib import Path +import statistics +from subprocess import check_output, CalledProcessError +import sys +from datetime import datetime +from typing import Any, Literal, Mapping + +import pandas as pd +import matplotlib.pyplot as plt +import seaborn as sns + +DEFAULT_ENCODINGS = ["Unencoded", "Dictionary", "RunLength", "FixedStringDictionary", "LZ4"] + + +def print_error(*args, **kwargs) -> None: + print(*args, file=sys.stderr, **kwargs) + + +def rm_dir(path: str) -> None: + dirpath = Path(path) + if dirpath.exists() and dirpath.is_dir(): + shutil.rmtree(dirpath) + + +def read_json(path: str) -> dict: + with open(path) as file: + return json.load(file) + + +def flatten(x: list[list], /) -> list: + return [item for sublist in x for item in sublist] + + +class DictConvertible(ABC): + @abstractmethod + def as_dict(self) -> dict: + ... 
+ + +@dataclass(frozen=True, slots=True) +class Runtime(DictConvertible): + benchmark_name: str + durations: list[float] + + def as_dict(self) -> dict: + return { + "benchmark_name": self.benchmark_name, + "durations": self.durations, + } + + def min(self) -> float: + return min(self.durations) + + def max(self) -> float: + return max(self.durations) + + def average(self) -> float: + return statistics.fmean(self.durations) + + def median(self) -> float: + return statistics.median(self.durations) + + +@dataclass(slots=True) +class Runtimes(DictConvertible): + runtimes: list[Runtime] + encoding: str + benchmark_name: str + threading: str + + def as_dict(self) -> dict: + return {"runtimes": list(runtime.as_dict() for runtime in self.runtimes)} + + def min(self) -> float: + return min(map(lambda x: x.median(), self.runtimes)) + + def max(self) -> float: + return max(map(lambda x: x.median(), self.runtimes)) + + def average(self) -> float: + return statistics.fmean(map(lambda x: x.median(), self.runtimes)) + + def median(self) -> float: + return statistics.median(map(lambda x: x.median(), self.runtimes)) + + @classmethod + def from_json(cls, json_path: str) -> "Runtimes": + json_data = read_json(json_path) + runtimes = cls( + [], f"{json_data['branch']}-{json_data['encoding']}", json_data["benchmark_name"], json_data["threading"] + ) + json_data = json_data["benchmark"] + for benchmark in json_data["benchmarks"]: + name = benchmark["name"] + durations: list[float] = [run["duration"] for run in benchmark["successful_runs"]] + runtime = Runtime(name, durations) + runtimes.runtimes.append(runtime) + return runtimes + + +@dataclass(frozen=True, slots=True) +class MemoryConsumption(DictConvertible): + column_name: str + column_type: str + memory_consumption: int + + def as_dict(self) -> dict: + return { + "column_name": self.column_name, + "column_type": self.column_type, + "memory_consumption": self.memory_consumption, + } + + +@dataclass(slots=True) +class Metrics(DictConvertible): + memory_consumptions: list[MemoryConsumption] + encoding: str + benchmark_name: str + + def as_dict(self) -> dict: + return {"memory_consumption": list(consumption.as_dict() for consumption in self.memory_consumptions)} + + def _as_generator(self, *, only_string_columns: bool): + return ( + consumption + for consumption in self.memory_consumptions + if not only_string_columns or consumption.column_type == "string" + ) + + def min(self, *, only_string_columns: bool) -> int: + return min(map(lambda x: x.memory_consumption, self._as_generator(only_string_columns=only_string_columns))) + + def max(self, *, only_string_columns: bool) -> int: + return max(map(lambda x: x.memory_consumption, self._as_generator(only_string_columns=only_string_columns))) + + def average(self, *, only_string_columns: bool) -> float: + return statistics.fmean( + map(lambda x: x.memory_consumption, self._as_generator(only_string_columns=only_string_columns)) + ) + + def median(self, *, only_string_columns: bool) -> float: + return statistics.median( + map(lambda x: x.memory_consumption, self._as_generator(only_string_columns=only_string_columns)) + ) + + def sum(self, *, only_string_columns: bool) -> int: + return sum(map(lambda x: x.memory_consumption, self._as_generator(only_string_columns=only_string_columns))) + + @classmethod + def from_json(cls, json_path: str) -> "Metrics": + json_data = read_json(json_path) + metrics = cls([], f"{json_data['branch']}-{json_data['encoding']}", json_data["benchmark_name"]) + json_data = 
json_data["benchmark"] + for segment in json_data["segments"]: + # Only consider moment == "init". + if segment["moment"] != "init": + continue + column_type = segment["column_data_type"] + column_name = segment["column_name"] + estimated_size = segment["estimated_size_in_bytes"] + metrics.memory_consumptions.append(MemoryConsumption(column_name, column_type, estimated_size)) + return metrics + + +def clean_encoding_name(encoding: str) -> str: + return ( + encoding_without_branch + if (encoding_without_branch := encoding.split("-")[-1]) in DEFAULT_ENCODINGS + or encoding_without_branch.split(" ")[0] in DEFAULT_ENCODINGS + else encoding + ) + + +def plot(results: dict[str, list], *, title: str, yaxis: str, path: str, figsize: tuple[int, int] = (15, 10)) -> None: + f, axis = plt.subplots(1, 1, figsize=figsize) + # The transposing of the orientation is done to allow for empty cells. + data = pd.DataFrame.from_dict(results, orient="index") + data = data.transpose() + data = data.rename(clean_encoding_name, axis="columns") + if data.empty: + print_error("Data Frame is empty; no result data to show!") + return + data.plot( + kind="box", + ax=axis, + title=title, + xlabel="Encodings", + ylabel=f"{yaxis} (Logarithmic Scale)", + logy=True, + ) + axis.set_xticklabels(axis.get_xticklabels(), rotation=-45) + f.tight_layout() + f.savefig(path) + + +def refine_stats( + stats: dict[str, tuple[Mapping[Literal["ST", "MT"], Mapping[str, Runtimes]], Mapping[str, Metrics]]] +) -> dict: + result = {"ENCODING": [], "RUNTIME": [], "MODE": [], "SIZE": [], "BENCHMARK": []} + for benchmark_name, benchmark_results in stats.items(): + for threading, runtimes in benchmark_results[0].items(): + for encoding, runtime_wrapper in runtimes.items(): + runtime = runtime_wrapper.average() / 1e9 # Given in nanoseconds, return seconds + metrics = benchmark_results[1][encoding] + size = metrics.sum(only_string_columns=True) / 1e6 # Given in Bytes, return MB + result["SIZE"].append(size) + result["RUNTIME"].append(runtime) + result["MODE"].append(threading) + cleaned_encoding_name = clean_encoding_name(encoding) + result["ENCODING"].append(cleaned_encoding_name) + result["BENCHMARK"].append(benchmark_name) + return result + + +def plot_stats( + stats: dict[str, tuple[Mapping[Literal["ST", "MT"], Mapping[str, Runtimes]], Mapping[str, Metrics]]], + *, + path: str, + sharex: bool, + sharey: bool, +) -> None: + data = pd.DataFrame.from_dict(refine_stats(stats)) + data = data.sort_values(["BENCHMARK", "ENCODING"]) + g = sns.FacetGrid(data, col="BENCHMARK", row="MODE", hue="ENCODING", sharex=sharex, sharey=sharey) + g.map(sns.scatterplot, "SIZE", "RUNTIME") + g.set(xscale="log") + g.add_legend() + g.set_axis_labels("Memory Consumption [Sum MB] (Log)", "Duration [Avg Seconds]") + g.tight_layout() + g.savefig(path) + + +def plot_query_timings( + stats: list[Runtimes], *, benchmark_name: str, path: str, figsize: tuple[int, int] = (30, 10) +) -> None: + stats = [stat for stat in stats if stat.threading == "ST"] + grouped: dict[str, list[Runtimes]] = Evaluation.group_by(stats, "encoding") + stats_grouped_by_encoding = { + encoding: { + runtime.benchmark_name: runtime.median() / 1e9 # Provided in ns, wanted s + for runtime in flatten(list(map(lambda x: x.runtimes, runtimes))) + } + for encoding, runtimes in grouped.items() + } + data = pd.DataFrame.from_dict(stats_grouped_by_encoding) + data = data.rename(clean_encoding_name, axis="columns") + data = data.reindex(sorted(data.columns), axis=1) + f, axis = plt.subplots(1, 1, figsize=figsize) + 
if data.empty: + print_error("Data Frame is empty; no result data to show!") + return + data.plot( + kind="bar", + ax=axis, + title=f"Query times for benchmark {benchmark_name} (ST)", + xlabel="Queries", + ylabel="Median Runtime (seconds) across all runs of the query (Logarithmic Scale)", + logy=True, + ) + f.tight_layout() + f.savefig(path) + + +class Benchmarking: + @dataclass(frozen=True) + class Config: + benchmarks: list[str] + encodings: list[str] + build_path: str + threading: Literal["ST", "MT", "both"] + time_limit: int + scale_factor: float + tmp_path: str + metrics: bool + + @classmethod + def from_namespace(cls, namespace: argparse.Namespace) -> "Benchmarking.Config": + encodings = flatten(namespace.encodings) + if namespace.default_encodings: + encodings.extend(DEFAULT_ENCODINGS) + if len(encodings) == 0: + exit("No encodings to test") + return cls( + benchmarks=flatten(namespace.benchmarks), + encodings=encodings, + build_path=namespace.build_path, + threading=namespace.threading, + time_limit=namespace.time_limit, + scale_factor=namespace.scale_factor, + tmp_path=namespace.tmp_path, + metrics=namespace.metrics, + ) + + @staticmethod + def register_arguments(parser: ArgumentParser) -> ArgumentParser: + def check_positive(value) -> int: + ivalue = int(value) + if ivalue <= 0: + raise ArgumentTypeError("%s is an invalid positive int value" % value) + return ivalue + + parser.add_argument( + "-b", + "--benchmark", + action="append", + dest="benchmarks", + nargs="*", + help="A benchmark (e.g. hyriseBenchmarkTPCH) to run.", + ) + parser.add_argument( + "-e", + "--encoding", + action="append", + dest="encodings", + nargs="*", + default=[], + help="An encoding that should be tested.", + ) + parser.add_argument( + "-p", + "--build-path", + dest="build_path", + type=str, + required=True, + help="Path where the executables to benchmark are located.", + ) + parser.add_argument( + "-t", + "--threading", + dest="threading", + type=str, + required=False, + default="both", + help="Which threading should be executed. Variants: ST, MT, both", + ) + parser.add_argument( + "-l", + "--timeout", + dest="time_limit", + type=check_positive, + required=False, + default=60, + help="The timeout in seconds to pass to the benchmarks. Defaults to 60.", + ) + parser.add_argument( + "-s", + "--scale-factor", + dest="scale_factor", + type=float, + required=True, + help="The scale factor to pass to the benchmarks that support scaling. Note that this number might get " + "rounded or ignored if necessary for a benchmark.", + ) + parser.add_argument( + "--tmp-path", + dest="tmp_path", + type=str, + required=False, + default="tmp", + help="The directory where the benchmark result files will be stored.", + ) + parser.add_argument( + "-d", + "--default-encodings", + dest="default_encodings", + action=BooleanOptionalAction, + default=False, + help="Whether to run benchmarks for all default encodings.", + ) + parser.add_argument( + "-m", + "--metrics", + dest="metrics", + action=BooleanOptionalAction, + default=False, + help="Whether to run metrics benchmarks or timing benchmarks.", + ) + return parser + + @staticmethod + def run(config: Config) -> list[Path]: + """ + Runs all benchmarks as outlined in the passed-in config. + + Returns a list of paths of output files, one for each benchmark. 
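+        Each file contains the benchmark's JSON output, which is afterwards wrapped with encoding, branch, and threading metadata (see the loop below).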
+ """ + + @dataclass(frozen=True) + class BenchmarkConfig(ABC): + encoding: str + threading: Literal["ST", "MT"] + time_limit: int + scale_factor: float + metrics: bool + + class BenchmarkRunner: + name = "Benchmark" + + def __init__(self, path: str, tmp_path: str): + self._config: BenchmarkConfig | None = None + self._path = path + self._tmp_path = tmp_path + + @staticmethod + def create(name: str, *args: Any, **kwargs: Any) -> "BenchmarkRunner": + classes = { + "hyriseBenchmarkTPCH": TpcHBenchmarkRunner, + "hyriseBenchmarkTPCDS": TpcDsBenchmarkRunner, + "hyriseBenchmarkJoinOrder": JobBenchmarkRunner, + "hyriseBenchmarkStarSchema": SsbBenchmarkRunner, + } + return classes[name](*args, **kwargs) + + def _write_encoding_config_file(self, threading: Literal["ST", "MT"], encoding: str, metrics: bool) -> str: + """ + Create a config file as depicted in the `--full_help` of the benchmarks and return its path. + """ + config_path = os.path.join( + self._tmp_path, "config", f"{self.name}-{threading}-{encoding}-{metrics}.json" + ) + config_contents = {"default": {"encoding": "Dictionary"}, "type": {"string": {"encoding": encoding}}} + with open(config_path, mode="w") as config_file: + json.dump(config_contents, config_file) + return config_path + + def run(self, benchmark_config: BenchmarkConfig) -> Path: + self._config = benchmark_config + return Path(self._run(self._config.threading, self._config.encoding, self._config.metrics)) + + def _run(self, threading: Literal["ST", "MT"], encoding: str, metrics: bool) -> str: + self._pre_run_cleanup() + st_command = self._get_arguments(threading, encoding, metrics) + check_output(st_command) + return self._output_path(threading, encoding, metrics) + + def _get_arguments(self, threading: Literal["ST", "MT"], encoding: str, metrics: bool) -> list[str]: + encoding_config_path = self._write_encoding_config_file(threading, encoding, metrics) + arguments = [ + self._path, + "-o", + self._output_path(threading, encoding, metrics), + "-e", + encoding_config_path, + ] + if threading == "MT": + arguments += ["--scheduler", "--clients", str(multiprocessing.cpu_count() // 4), "--mode=Shuffled"] + # Multithreaded runs need longer times to be meaningful. Default to 20 minutes. + arguments += ["-t", str(self._config.time_limit * 20)] + else: + arguments += ["-t", str(self._config.time_limit)] + if metrics: + arguments += ["--system_metrics", "-r", "1"] + return arguments + + def _output_path(self, threading: Literal["ST"] | Literal["MT"], encoding: str, metrics: bool) -> str: + try: + git_commit = check_output(["git", "rev-parse", "HEAD"]) + except CalledProcessError: + git_commit = b"" + return os.path.join( + self._tmp_path, + f'{git_commit.decode("utf-8").strip()}-{self.name}-{threading}-{encoding}-{metrics}.json', + ) + + @abstractmethod + def _pre_run_cleanup(self): + ... + + class TpcHBenchmarkRunner(BenchmarkRunner): + name = "TPC-H" + + def _pre_run_cleanup(self) -> None: + rm_dir("tpch_cached_tables") + + def _get_arguments(self, threading: Literal["ST", "MT"], encoding: str, metrics: bool) -> list[str]: + return super()._get_arguments(threading, encoding, metrics) + ["-s", str(self._config.scale_factor)] + + class TpcDsBenchmarkRunner(BenchmarkRunner): + name = "TPC-DS" + + def _pre_run_cleanup(self) -> None: + rm_dir("tpcds_cached_tables") + + def _get_arguments(self, threading: Literal["ST", "MT"], encoding: str, metrics: bool) -> list[str]: + # TPC-DS only supports integer scales. 
+ return super()._get_arguments(threading, encoding, metrics) + [ + "-s", + str(max(1, int(self._config.scale_factor))), + ] + + class JobBenchmarkRunner(BenchmarkRunner): + name = "JOB" + + def _pre_run_cleanup(self) -> None: + rm_dir("imdb_data") + + class SsbBenchmarkRunner(BenchmarkRunner): + name = "SSB" + + def _pre_run_cleanup(self) -> None: + rm_dir("imdb_data") + + def locate_benchmarks(benchmark_names: list[str], build_path: str, tmp_path: str) -> list[BenchmarkRunner]: + benchmark_objects: list[BenchmarkRunner] = [] + for benchmark_name in benchmark_names: + benchmark_path = os.path.join(build_path, benchmark_name) + if not os.path.isfile(benchmark_path): + exit(f"Cannot locate {benchmark_name} at {benchmark_path}!") + benchmark_objects.append(BenchmarkRunner.create(benchmark_name, benchmark_path, tmp_path)) + return benchmark_objects + + (Path(config.tmp_path) / Path("config")).mkdir(parents=True, exist_ok=True) + benchmarks = locate_benchmarks(config.benchmarks, config.build_path, config.tmp_path) + result_paths: list[Path] = [] + threading_modes: list[Literal["ST", "MT"]] = ["ST", "MT"] if config.threading == "both" else [config.threading] + if config.metrics: + threading_modes = ["ST"] + total_runs = len(benchmarks) * len(config.encodings) * len(threading_modes) + current_benchmark = 0 + for benchmark in benchmarks: + for encoding_to_benchmark in config.encodings: + for threading_mode in threading_modes: + current_benchmark += 1 + print(f"Running benchmark {current_benchmark}/{total_runs}") + print(f"\tCurrently running {benchmark.name} with {encoding_to_benchmark=} and {threading_mode=}") + run_config = BenchmarkConfig( + encoding=encoding_to_benchmark, + threading=threading_mode, + time_limit=config.time_limit, + scale_factor=config.scale_factor, + metrics=config.metrics, + ) + result_path = benchmark.run(run_config) + # Add encoding name to result json. 
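+                    # The rewritten file has the shape {"benchmark": <original output>, "encoding": ..., "benchmark_name": ..., "threading": ..., "branch": ...}, which Runtimes.from_json and Metrics.from_json read back during evaluation.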
+                    with open(result_path, mode="r") as result_json:
+                        result_json_content = json.load(result_json)
+                    try:
+                        git_branch = check_output(["git", "branch", "--show-current"])
+                    except CalledProcessError:
+                        git_branch = b""
+                    result_with_encoding = {
+                        "benchmark": result_json_content,
+                        "encoding": encoding_to_benchmark,
+                        "benchmark_name": benchmark.name,
+                        "threading": threading_mode,
+                        "branch": git_branch.decode("utf-8").strip(),
+                    }
+                    with open(result_path, mode="w") as result_json:
+                        json.dump(result_with_encoding, result_json)
+
+                    result_paths.append(result_path)
+        return result_paths
+
+
+class Evaluation:
+    @dataclass(frozen=True)
+    class Config:
+        timing_benchmark_files: list[str]
+        metric_benchmark_files: list[str]
+        ignore_encodings: list[str]
+        output_directory: str
+        sharex: bool
+        sharey: bool
+        only_comparison: bool
+
+        @classmethod
+        def from_namespace(cls, namespace: argparse.Namespace) -> "Evaluation.Config":
+            now = datetime.now()
+            now_date = f"{now.year}{now.month:02d}{now.day:02d}"
+            now_time = f"{now.hour:02d}{now.minute:02d}{now.second:02d}"
+            output_directory = os.path.join(namespace.output_directory, f"run-{now_date}-{now_time}")
+
+            return cls(
+                timing_benchmark_files=flatten(namespace.timing_benchmark_files),
+                metric_benchmark_files=flatten(namespace.metric_benchmark_files),
+                ignore_encodings=flatten(namespace.ignore_encodings),
+                output_directory=output_directory,
+                sharex=namespace.sharex,
+                sharey=namespace.sharey,
+                only_comparison=namespace.only_comparison,
+            )
+
+    @staticmethod
+    def register_arguments(parser: ArgumentParser) -> ArgumentParser:
+        parser.add_argument(
+            "-t",
+            "--timing-benchmark-files",
+            action="append",
+            dest="timing_benchmark_files",
+            required=True,
+            nargs="+",
+            help="All timing benchmark files to evaluate.",
+        )
+        parser.add_argument(
+            "-m",
+            "--metric-benchmark-files",
+            action="append",
+            dest="metric_benchmark_files",
+            required=True,
+            nargs="+",
+            help="All metric benchmark files to evaluate.",
+        )
+        parser.add_argument(
+            "-i",
+            "--ignore",
+            action="append",
+            dest="ignore_encodings",
+            required=False,
+            default=[],
+            nargs="+",
+            help="Encodings to ignore despite being present in the provided files.",
+        )
+        parser.add_argument(
+            "-o",
+            "--output-directory",
+            dest="output_directory",
+            type=str,
+            required=True,
+            help="The directory where the output should be stored.",
+        )
+        parser.add_argument(
+            "--share-x",
+            dest="sharex",
+            action=BooleanOptionalAction,
+            default=True,
+            help="Whether to share the x axis of the comparison plot. Defaults to True.",
+        )
+        parser.add_argument(
+            "--share-y",
+            dest="sharey",
+            action=BooleanOptionalAction,
+            default=True,
+            help="Whether to share the y axis of the comparison plot. 
Defaults to True.", + ) + parser.add_argument( + "--only-comparison", + dest="only_comparison", + action=BooleanOptionalAction, + default=False, + help="Whether to only create the comparison plot.", + ) + return parser + + @staticmethod + def is_excluded(encoding: str, config: Config) -> bool: + return encoding.split("-")[-1] in config.ignore_encodings + + @staticmethod + def run(config: Config) -> list[Path]: + Path(config.output_directory).mkdir(parents=True, exist_ok=True) + stats: dict[str, tuple[Mapping[Literal["ST", "MT"], Mapping[str, Runtimes]], Mapping[str, Metrics]]] = {} + metric_comparisons, metric_paths = Evaluation._compare_metrics(config) + timing_comparisons, timing_paths = Evaluation._compare_timing_benchmarks(config) + metric_stats: dict[str, Mapping[str, Metrics]] = {} + for benchmark_name, metrics in metric_comparisons.items(): + metric_stats[benchmark_name] = metrics + for benchmark_name, runtimes in timing_comparisons.items(): + metrics = metric_stats[benchmark_name] + stats[benchmark_name] = runtimes, metrics + plot_path = os.path.join(config.output_directory, "comparison.svg") + plot_stats(stats, path=plot_path, sharex=config.sharex, sharey=config.sharey) + paths = metric_paths + paths.extend(timing_paths) + paths.append(Path(plot_path)) + return paths + + @staticmethod + # Source: https://stackoverflow.com/a/5695268 + def group_by(array: list, key: str) -> dict[Any, list]: + values = set(map(lambda x: x.__getattribute__(key), array)) + return {value: [y for y in array if y.__getattribute__(key) == value] for value in values} + + # Returns a tuple of a dictionary of the metrics grouped by benchmark and encoding with + # path names of the corresponding plot file and raw data file. + @staticmethod + def _compare_metrics(config: Config) -> tuple[dict[str, dict[str, Metrics]], list[Path]]: + paths: list[Path] = [] + result_jsons = config.metric_benchmark_files + metrics_list = [ + metric + for metric in [Metrics.from_json(result_json) for result_json in result_jsons] + if not Evaluation.is_excluded(metric.encoding, config) + ] + metrics_grouped_by_benchmark: dict[str, list[Metrics]] = Evaluation.group_by(metrics_list, "benchmark_name") + metrics_grouped_by_benchmark_and_encoding_list: dict[str, dict[str, list[Metrics]]] = { + benchmark: Evaluation.group_by(metrics_by_benchmark, "encoding") + for benchmark, metrics_by_benchmark in metrics_grouped_by_benchmark.items() + } + # Ensure that every benchmark-encoding combination only has one entry. 
+ for benchmark_group in metrics_grouped_by_benchmark_and_encoding_list.values(): + for encoding_group in benchmark_group.values(): + if len(encoding_group) > 1: + raise RuntimeError(f"Too many encoding groups: {len(encoding_group)}") + metrics_grouped_by_benchmark_and_encoding: dict[str, dict[str, Metrics]] = { + benchmark: {encoding: entries[0] for encoding, entries in group.items()} + for benchmark, group in metrics_grouped_by_benchmark_and_encoding_list.items() + } + if not config.only_comparison: + for benchmark_name, metrics_by_benchmark in metrics_grouped_by_benchmark_and_encoding.items(): + metrics: dict[str, Metrics] = {encoding: metric for encoding, metric in metrics_by_benchmark.items()} + plot_path = os.path.join(config.output_directory, "metrics", "plots", f"{benchmark_name}.svg") + Path(plot_path).parent.mkdir(parents=True, exist_ok=True) + metrics_to_plot: dict[str, list[int]] = {} + for encoding, metric in metrics.items(): + metrics_to_plot[f"{encoding} (String)"] = list( + map( + lambda x: x.memory_consumption, + filter(lambda x: x.column_type == "string", metric.memory_consumptions), + ) + ) + metrics_to_plot[f"{encoding} (All)"] = list( + map(lambda x: x.memory_consumption, metric.memory_consumptions) + ) + plot( + metrics_to_plot, + title=f"Sizes for {benchmark_name}", + yaxis="Size of Segments", + path=plot_path, + figsize=(22, 10), + ) + paths.append(Path(plot_path)) + # Dumps raw data. + raw_file_path = os.path.join(config.output_directory, "metrics", "raw", f"{benchmark_name}.json") + Path(raw_file_path).parent.mkdir(parents=True, exist_ok=True) + with open(raw_file_path, "w") as f: + raw_metrics = {encoding: metric.as_dict() for encoding, metric in metrics.items()} + json.dump(raw_metrics, f) + paths.append(Path(raw_file_path)) + return metrics_grouped_by_benchmark_and_encoding, paths + + @staticmethod + def _compare_timing_benchmarks( + config: Config, + ) -> tuple[dict[str, dict[Literal["ST", "MT"], dict[str, Runtimes]]], list[Path]]: + paths: list[Path] = [] + result_jsons = config.timing_benchmark_files + timings_list = [ + timing + for timing in [Runtimes.from_json(result_json) for result_json in result_jsons] + if not Evaluation.is_excluded(timing.encoding, config) + ] + timings_grouped_by_benchmark: dict[str, list[Runtimes]] = Evaluation.group_by(timings_list, "benchmark_name") + for benchmark_name, benchmark_group in timings_grouped_by_benchmark.items(): + plot_path = os.path.join(config.output_directory, "runtime", "plots", f"{benchmark_name}-queries.svg") + Path(plot_path).parent.mkdir(parents=True, exist_ok=True) + plot_query_timings(benchmark_group, benchmark_name=benchmark_name, path=plot_path) + paths.append(Path(plot_path)) + timings_grouped_by_benchmark_and_encoding_list: dict[str, dict[str, list[Runtimes]]] = { + benchmark: Evaluation.group_by(timings_by_benchmark, "encoding") + for benchmark, timings_by_benchmark in timings_grouped_by_benchmark.items() + } + # Ensures that every benchmark-encoding combination only has two entries, one per threading. 
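+        # At most one ST and one MT result file are expected per benchmark-encoding combination; anything more indicates duplicate inputs.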
+        for benchmark_group in timings_grouped_by_benchmark_and_encoding_list.values():
+            for encoding_group in benchmark_group.values():
+                if len(encoding_group) > 2:
+                    raise RuntimeError(f"Too many timing entries: {len(encoding_group)}")
+        timings_grouped_by_benchmark_and_encoding: dict[str, dict[Literal["ST", "MT"], dict[str, Runtimes]]] = {
+            benchmark: {
+                "ST": {
+                    encoding: next(filter(lambda x: x.threading == "ST", entries))
+                    for encoding, entries in group.items()
+                },
+                "MT": {
+                    encoding: next(filter(lambda x: x.threading == "MT", entries))
+                    for encoding, entries in group.items()
+                },
+            }
+            for benchmark, group in timings_grouped_by_benchmark_and_encoding_list.items()
+        }
+        if not config.only_comparison:
+            threading: Literal["ST", "MT"]
+            for threading in ["ST", "MT"]:
+                for name, timing_group in timings_grouped_by_benchmark_and_encoding.items():
+                    times = timing_group[threading]
+                    plot_path = os.path.join(config.output_directory, "runtime", "plots", f"{name}-{threading}.svg")
+                    Path(plot_path).parent.mkdir(parents=True, exist_ok=True)
+                    times_to_plot = {
+                        encoding: list(map(lambda x: x.median(), runtimes.runtimes))
+                        for encoding, runtimes in times.items()
+                    }
+                    plot(
+                        times_to_plot,
+                        title=f"Median duration for {name}",
+                        yaxis="Median runtime (in ns) across benchmark tests",
+                        path=plot_path,
+                    )
+                    paths.append(Path(plot_path))
+                    # Dumps raw data.
+                    raw_file_path = os.path.join(config.output_directory, "runtime", "raw", f"{name}-{threading}.json")
+                    Path(raw_file_path).parent.mkdir(parents=True, exist_ok=True)
+                    with open(raw_file_path, "w") as f:
+                        raw_times = {encoding: runtimes.as_dict() for encoding, runtimes in times.items()}
+                        json.dump(raw_times, f)
+                    paths.append(Path(raw_file_path))
+        return timings_grouped_by_benchmark_and_encoding, paths
+
+
+def create_argument_parser() -> ArgumentParser:
+    parser = ArgumentParser()
+    subparsers = parser.add_subparsers(help="Mode of the script", dest="command")
+    benchmark_parser = subparsers.add_parser("benchmark", help="Run benchmarks for Hyrise")
+    Benchmarking.register_arguments(benchmark_parser)
+    evaluation_parser = subparsers.add_parser("evaluate", help="Evaluate benchmark results")
+    Evaluation.register_arguments(evaluation_parser)
+    return parser
+
+
+def main():
+    parser = create_argument_parser()
+    namespace = parser.parse_args()
+    command = namespace.command
+    match command:
+        case "benchmark":
+            config = Benchmarking.Config.from_namespace(namespace)
+            files = Benchmarking.run(config)
+            print(f"Resulting files: {files}")
+        case "evaluate":
+            config = Evaluation.Config.from_namespace(namespace)
+            files = Evaluation.run(config)
+            print(f"Resulting files: {files}")
+        case _:
+            print_error(f"Unknown command '{command}'!")
+            exit(1)
+
+
+if __name__ == "__main__":
+    main()
diff --git a/src/benchmark/operators/table_scan_sorted_benchmark.cpp b/src/benchmark/operators/table_scan_sorted_benchmark.cpp
index f9c781964a..458240327b 100644
--- a/src/benchmark/operators/table_scan_sorted_benchmark.cpp
+++ b/src/benchmark/operators/table_scan_sorted_benchmark.cpp
@@ -208,6 +208,7 @@ void registerTableScanSortedBenchmarks() {
       {"None", EncodingAndSupportedDataTypes(EncodingType::Unencoded, {"Int", "String"})},
       {"Dictionary", EncodingAndSupportedDataTypes(EncodingType::Dictionary, {"Int", "String"})},
       {"FixedStringDictionary", EncodingAndSupportedDataTypes(EncodingType::FixedStringDictionary, {"String"})},
+      {"VariableStringDictionary", EncodingAndSupportedDataTypes(EncodingType::VariableStringDictionary, {"String"})},
      {"FrameOfReference", 
EncodingAndSupportedDataTypes(EncodingType::FrameOfReference, {"Int"})}, {"RunLength", EncodingAndSupportedDataTypes(EncodingType::RunLength, {"Int", "String"})}, {"LZ4", EncodingAndSupportedDataTypes(EncodingType::LZ4, {"Int", "String"})}}; diff --git a/src/lib/CMakeLists.txt b/src/lib/CMakeLists.txt index 66445dfa49..c9b50d0513 100644 --- a/src/lib/CMakeLists.txt +++ b/src/lib/CMakeLists.txt @@ -209,12 +209,12 @@ set( operators/index_scan.hpp operators/insert.cpp operators/insert.hpp - operators/join_helper/join_output_writing.cpp - operators/join_helper/join_output_writing.hpp operators/join_hash.cpp operators/join_hash.hpp operators/join_hash/join_hash_steps.hpp operators/join_hash/join_hash_traits.hpp + operators/join_helper/join_output_writing.cpp + operators/join_helper/join_output_writing.hpp operators/join_index.cpp operators/join_index.hpp operators/join_nested_loop.cpp @@ -484,6 +484,9 @@ set( storage/index/adaptive_radix_tree/adaptive_radix_tree_index.hpp storage/index/adaptive_radix_tree/adaptive_radix_tree_nodes.cpp storage/index/adaptive_radix_tree/adaptive_radix_tree_nodes.hpp + storage/index/chunk_index_statistics.cpp + storage/index/chunk_index_statistics.hpp + storage/index/chunk_index_type.hpp storage/index/group_key/composite_group_key_index.cpp storage/index/group_key/composite_group_key_index.hpp storage/index/group_key/group_key_index.cpp @@ -496,10 +499,6 @@ set( storage/index/group_key/variable_length_key_proxy.hpp storage/index/group_key/variable_length_key_store.cpp storage/index/group_key/variable_length_key_store.hpp - storage/index/chunk_index_statistics.cpp - storage/index/chunk_index_statistics.hpp - storage/index/table_index_statistics.cpp - storage/index/table_index_statistics.hpp storage/index/partial_hash/flat_map_iterator.cpp storage/index/partial_hash/flat_map_iterator.hpp storage/index/partial_hash/flat_map_iterator_impl.cpp @@ -508,7 +507,8 @@ set( storage/index/partial_hash/partial_hash_index.hpp storage/index/partial_hash/partial_hash_index_impl.cpp storage/index/partial_hash/partial_hash_index_impl.hpp - storage/index/chunk_index_type.hpp + storage/index/table_index_statistics.cpp + storage/index/table_index_statistics.hpp storage/lqp_view.cpp storage/lqp_view.hpp storage/lz4_segment.cpp @@ -560,9 +560,23 @@ set( storage/value_segment.hpp storage/value_segment/null_value_vector_iterable.hpp storage/value_segment/value_segment_iterable.hpp + storage/variable_string_dictionary/variable_string_dictionary_encoder.hpp + storage/variable_string_dictionary/variable_string_dictionary_iterable.hpp + storage/variable_string_dictionary/variable_string_vector.cpp + storage/variable_string_dictionary/variable_string_vector.hpp + storage/variable_string_dictionary/variable_string_vector_iterator.hpp + storage/variable_string_dictionary_segment.cpp + storage/variable_string_dictionary_segment.hpp storage/vector_compression/base_compressed_vector.hpp storage/vector_compression/base_vector_compressor.hpp storage/vector_compression/base_vector_decompressor.hpp + storage/vector_compression/bitpacking/bitpacking_compressor.cpp + storage/vector_compression/bitpacking/bitpacking_compressor.hpp + storage/vector_compression/bitpacking/bitpacking_decompressor.hpp + storage/vector_compression/bitpacking/bitpacking_iterator.hpp + storage/vector_compression/bitpacking/bitpacking_vector.cpp + storage/vector_compression/bitpacking/bitpacking_vector.hpp + storage/vector_compression/bitpacking/bitpacking_vector_type.hpp storage/vector_compression/compressed_vector_type.cpp 
storage/vector_compression/compressed_vector_type.hpp storage/vector_compression/fixed_width_integer/fixed_width_integer_compressor.cpp @@ -571,13 +585,6 @@ set( storage/vector_compression/fixed_width_integer/fixed_width_integer_utils.hpp storage/vector_compression/fixed_width_integer/fixed_width_integer_vector.hpp storage/vector_compression/resolve_compressed_vector_type.hpp - storage/vector_compression/bitpacking/bitpacking_compressor.cpp - storage/vector_compression/bitpacking/bitpacking_compressor.hpp - storage/vector_compression/bitpacking/bitpacking_iterator.hpp - storage/vector_compression/bitpacking/bitpacking_decompressor.hpp - storage/vector_compression/bitpacking/bitpacking_vector.hpp - storage/vector_compression/bitpacking/bitpacking_vector.cpp - storage/vector_compression/bitpacking/bitpacking_vector_type.hpp storage/vector_compression/vector_compression.cpp storage/vector_compression/vector_compression.hpp strong_typedef.hpp diff --git a/src/lib/import_export/binary/binary_parser.cpp b/src/lib/import_export/binary/binary_parser.cpp index 381a8df2b1..847194c50b 100644 --- a/src/lib/import_export/binary/binary_parser.cpp +++ b/src/lib/import_export/binary/binary_parser.cpp @@ -26,6 +26,7 @@ #include "storage/table.hpp" #include "storage/table_column_definition.hpp" #include "storage/value_segment.hpp" +#include "storage/variable_string_dictionary_segment.hpp" #include "storage/vector_compression/bitpacking/bitpacking_vector.hpp" #include "storage/vector_compression/bitpacking/bitpacking_vector_type.hpp" #include "storage/vector_compression/compressed_vector_type.hpp" @@ -175,6 +176,8 @@ std::shared_ptr BinaryParser::_import_segment(std::ifstream& fi } else { Fail("Unsupported data type for FixedStringDictionary encoding"); } + case EncodingType::VariableStringDictionary: + return _import_variable_string_length_segment(file, row_count); case EncodingType::RunLength: return _import_run_length_segment(file, row_count); case EncodingType::FrameOfReference: @@ -219,6 +222,24 @@ std::shared_ptr> BinaryParser::_import_dictionary_segment(s return std::make_shared>(dictionary, attribute_vector); } +template +std::shared_ptr> BinaryParser::_import_variable_string_length_segment( + std::ifstream& file, ChunkOffset row_count) { + // Read attribute vector compression type and use it to decompress. + const auto compressed_vector_type_id = _read_value(file); + const auto attribute_vector = _import_attribute_vector(file, row_count, compressed_vector_type_id); + + // Read offset vector. + const auto offset_vector_size = _read_value(file); + const auto offset_vector = std::make_shared>(_read_values(file, offset_vector_size)); + + // Read dictionary. 
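+  // The dictionary is stored as its size followed by the null-terminated string data, mirroring
+  // BinaryWriter::_write_segment for VariableStringDictionarySegment.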
+ const auto dictionary_size = _read_value(file); + const auto dictionary = std::make_shared>(_read_values(file, dictionary_size)); + + return std::make_shared>(dictionary, attribute_vector, offset_vector); +} + std::shared_ptr> BinaryParser::_import_fixed_string_dictionary_segment( std::ifstream& file, ChunkOffset row_count) { const auto compressed_vector_type_id = _read_value(file); diff --git a/src/lib/import_export/binary/binary_parser.hpp b/src/lib/import_export/binary/binary_parser.hpp index f36d0133db..d1356f69de 100644 --- a/src/lib/import_export/binary/binary_parser.hpp +++ b/src/lib/import_export/binary/binary_parser.hpp @@ -16,6 +16,7 @@ #include "storage/run_length_segment.hpp" #include "storage/table.hpp" #include "storage/value_segment.hpp" +#include "storage/variable_string_dictionary_segment.hpp" #include "storage/vector_compression/bitpacking/bitpacking_vector_type.hpp" namespace hyrise { @@ -76,6 +77,10 @@ class BinaryParser { template static std::shared_ptr> _import_dictionary_segment(std::ifstream& file, ChunkOffset row_count); + template + static std::shared_ptr> _import_variable_string_length_segment( + std::ifstream& file, ChunkOffset row_count); + static std::shared_ptr> _import_fixed_string_dictionary_segment( std::ifstream& file, ChunkOffset row_count); diff --git a/src/lib/import_export/binary/binary_writer.cpp b/src/lib/import_export/binary/binary_writer.cpp index 5c414accab..f618d4d018 100644 --- a/src/lib/import_export/binary/binary_writer.cpp +++ b/src/lib/import_export/binary/binary_writer.cpp @@ -23,6 +23,7 @@ #include "storage/segment_iterate.hpp" #include "storage/table.hpp" #include "storage/value_segment.hpp" +#include "storage/variable_string_dictionary_segment.hpp" #include "storage/vector_compression/bitpacking/bitpacking_vector.hpp" #include "storage/vector_compression/bitpacking/bitpacking_vector_type.hpp" #include "storage/vector_compression/compressed_vector_type.hpp" @@ -350,6 +351,26 @@ void BinaryWriter::_write_segment(const LZ4Segment& lz4_segment, bool /*colum } } +template +void BinaryWriter::_write_segment(const VariableStringDictionarySegment& dictionary_segment, + bool /*column_is_nullable*/, std::ofstream& ofstream) { + export_value(ofstream, EncodingType::VariableStringDictionary); + + // Write attribute vector compression type and data. + const auto compressed_vector_type_id = _compressed_vector_type_id(dictionary_segment); + export_value(ofstream, compressed_vector_type_id); + _export_compressed_vector(ofstream, *dictionary_segment.compressed_vector_type(), + *dictionary_segment.attribute_vector()); + + // Write offset vector. 
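+  // The offsets are 32-bit positions into the dictionary blob; the encoder asserts that the blob stays below 4 GB.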
+ export_value(ofstream, static_cast(dictionary_segment.offset_vector()->size())); + export_values(ofstream, *dictionary_segment.offset_vector()); + + // Write the dictionary size and dictionary + export_value(ofstream, static_cast(dictionary_segment.dictionary()->size())); + export_values(ofstream, *dictionary_segment.dictionary()); +} + template CompressedVectorTypeID BinaryWriter::_compressed_vector_type_id( const AbstractEncodedSegment& abstract_encoded_segment) { diff --git a/src/lib/import_export/binary/binary_writer.hpp b/src/lib/import_export/binary/binary_writer.hpp index 9433fd54de..47159e1fcb 100644 --- a/src/lib/import_export/binary/binary_writer.hpp +++ b/src/lib/import_export/binary/binary_writer.hpp @@ -11,6 +11,7 @@ #include "storage/reference_segment.hpp" #include "storage/run_length_segment.hpp" #include "storage/value_segment.hpp" +#include "storage/variable_string_dictionary_segment.hpp" namespace hyrise { @@ -222,6 +223,10 @@ class BinaryWriter { template static void _write_segment(const LZ4Segment& lz4_segment, bool /*column_is_nullable*/, std::ofstream& ofstream); + template + static void _write_segment(const VariableStringDictionarySegment& dictionary_segment, bool /*column_is_nullable*/, + std::ofstream& ofstream); + template static CompressedVectorTypeID _compressed_vector_type_id(const AbstractEncodedSegment& abstract_encoded_segment); diff --git a/src/lib/operators/aggregate_hash.cpp b/src/lib/operators/aggregate_hash.cpp index a4de220599..e2b7388147 100644 --- a/src/lib/operators/aggregate_hash.cpp +++ b/src/lib/operators/aggregate_hash.cpp @@ -15,6 +15,7 @@ #include #include +#include #include "aggregate/window_function_traits.hpp" #include "all_type_variant.hpp" diff --git a/src/lib/operators/print.cpp b/src/lib/operators/print.cpp index dafc118f87..ccf5d7779d 100644 --- a/src/lib/operators/print.cpp +++ b/src/lib/operators/print.cpp @@ -249,6 +249,10 @@ std::string Print::_segment_type(const std::shared_ptr& segment segment_type += "FSD"; break; } + case EncodingType::VariableStringDictionary: { + segment_type += "VSD"; + break; + } case EncodingType::FrameOfReference: { segment_type += "FoR"; break; diff --git a/src/lib/operators/table_scan/column_like_table_scan_impl.cpp b/src/lib/operators/table_scan/column_like_table_scan_impl.cpp index e8df432bae..eaae6af31e 100644 --- a/src/lib/operators/table_scan/column_like_table_scan_impl.cpp +++ b/src/lib/operators/table_scan/column_like_table_scan_impl.cpp @@ -15,6 +15,10 @@ #include "storage/pos_lists/row_id_pos_list.hpp" #include "storage/segment_iterables/create_iterable_from_attribute_vector.hpp" #include "storage/segment_iterate.hpp" +// NOLINTBEGIN(misc-include-cleaner): VariableStringDictionary is accessed in _find_matches_in_dictionary. 
+#include "storage/variable_string_dictionary/variable_string_vector.hpp" +#include "storage/variable_string_dictionary/variable_string_vector_iterator.hpp" +// NOLINTEND(misc-include-cleaner) #include "types.hpp" #include "utils/assert.hpp" @@ -49,8 +53,8 @@ void ColumnLikeTableScanImpl::_scan_generic_segment( const AbstractSegment& segment, const ChunkID chunk_id, RowIDPosList& matches, const std::shared_ptr& position_filter) const { segment_with_iterators_filtered(segment, position_filter, [&](auto iter, [[maybe_unused]] const auto end) { - // Don't instantiate this for ReferenceSegments to save compile time as ReferenceSegments are handled - // via position_filter + // Do not instantiate this for ReferenceSegments to save compile time as ReferenceSegments are handled via + // position_filter. if constexpr (!is_reference_segment_iterable_v) { using ColumnDataType = typename decltype(iter)::ValueType; @@ -62,10 +66,10 @@ void ColumnLikeTableScanImpl::_scan_generic_segment( _scan_with_iterators(functor, iter, end, chunk_id, matches); }); } else { - Fail("Can only handle strings"); + Fail("Can only handle strings."); } } else { - Fail("ReferenceSegments have their own code paths and should be handled there"); + Fail("ReferenceSegments have their own code paths and should be handled there."); } }); } @@ -76,14 +80,27 @@ void ColumnLikeTableScanImpl::_scan_dictionary_segment(const BaseDictionarySegme // First, build a bitmap containing 1s/0s for matching/non-matching dictionary values. Second, iterate over the // attribute vector and check against the bitmap. If too many input rows have already been removed (are not part of // position_filter), this optimization is detrimental. See caller for that case. - std::pair> result; + auto result = std::pair>{}; - if (segment.encoding_type() == EncodingType::Dictionary) { - const auto& typed_segment = static_cast&>(segment); - result = _find_matches_in_dictionary(*typed_segment.dictionary()); - } else { - const auto& typed_segment = static_cast&>(segment); - result = _find_matches_in_dictionary(*typed_segment.fixed_string_dictionary()); + switch (segment.encoding_type()) { + case EncodingType::Dictionary: { + const auto& typed_segment = static_cast&>(segment); + result = _find_matches_in_dictionary(*typed_segment.dictionary()); + break; + } + case EncodingType::FixedStringDictionary: { + const auto& typed_segment = static_cast&>(segment); + result = _find_matches_in_dictionary(*typed_segment.fixed_string_dictionary()); + break; + } + case EncodingType::VariableStringDictionary: { + const auto& typed_segment = static_cast&>(segment); + result = _find_matches_in_dictionary(*typed_segment.variable_string_dictionary()); + break; + } + default: { + Fail("Segment is either not dictionary-encoded or encoding specialization is not implemented."); + } } const auto& match_count = result.first; @@ -103,8 +120,8 @@ void ColumnLikeTableScanImpl::_scan_dictionary_segment(const BaseDictionarySegme return; } - // LIKE matches no rows - if (match_count == 0u) { + // LIKE matches no rows. 
+ if (match_count == 0) { ++num_chunks_with_early_out; return; } @@ -125,21 +142,23 @@ std::pair> ColumnLikeTableScanImpl::_find_matches_in_d auto& count = result.first; auto& dictionary_matches = result.second; - count = 0u; - dictionary_matches.reserve(dictionary.size()); + count = 0; + dictionary_matches.resize(dictionary.size()); _matcher.resolve(_invert_results, [&](const auto& matcher) { #ifdef __clang__ -// For the loop through the dictionary, we want to use const auto& for DictionarySegments. However, -// FixedStringVector iterators return an std::string_view value. Thus, we disable clang's -Wrange-loop-analysis -// error about a potential copy for the loop value. +// For the loop through the dictionary, we want to use const auto& for DictionarySegments. However, FixedStringVector +// iterators return an std::string_view value. Thus, we disable clang's -Wrange-loop-analysis error about a potential +// copy for the loop value. #pragma GCC diagnostic push #pragma GCC diagnostic ignored "-Wrange-loop-analysis" #endif + auto index = size_t{0}; for (const auto& value : dictionary) { - const auto matches = matcher(value); - count += static_cast(matches); - dictionary_matches.push_back(matches); + const auto match_result = matcher(value); + count += static_cast(match_result); + dictionary_matches[index] = match_result; + ++index; } #ifdef __clang__ diff --git a/src/lib/operators/table_scan/column_like_table_scan_impl.hpp b/src/lib/operators/table_scan/column_like_table_scan_impl.hpp index 87fd92e1a2..36b24c37e8 100644 --- a/src/lib/operators/table_scan/column_like_table_scan_impl.hpp +++ b/src/lib/operators/table_scan/column_like_table_scan_impl.hpp @@ -7,8 +7,6 @@ #include #include -#include - #include "abstract_dereferenced_column_table_scan_impl.hpp" #include "expression/evaluation/like_matcher.hpp" #include "types.hpp" diff --git a/src/lib/storage/create_iterable_from_segment.hpp b/src/lib/storage/create_iterable_from_segment.hpp index ca592b3ef5..58c1c4cb10 100644 --- a/src/lib/storage/create_iterable_from_segment.hpp +++ b/src/lib/storage/create_iterable_from_segment.hpp @@ -26,20 +26,22 @@ class ReferenceSegment; template class ReferenceSegmentIterable; +template + requires(std::is_same_v) +class VariableStringDictionarySegment; + /** * @defgroup Uniform interface to create an iterable from a segment * - * These methods cannot be part of the segments' interfaces because - * reference segment are not templated and thus don’t know their type. + * These methods cannot be part of the segments' interfaces because reference segment are not templated and thus do not + * know their type. * - * All iterables implement the same interface using static polymorphism - * (i.e. the CRTP pattern, see segment_iterables/.hpp). + * All iterables implement the same interface using static polymorphism (i.e. the CRTP pattern, see + * segment_iterables/.hpp). * - * In debug mode, create_iterable_from_segment returns a type erased - * iterable, i.e., all iterators have the same type + * In debug mode, create_iterable_from_segment returns a type erased iterable, i.e., all iterators have the same type. * - * Functions must be forward-declared because otherwise, we run into - * circular include dependencies. + * Functions must be forward-declared because otherwise, we run into circular include dependencies. 
* * @{ */ @@ -73,10 +75,13 @@ template auto create_iterable_from_segment(const ReferenceSegment& segment); +template +auto create_iterable_from_segment(const VariableStringDictionarySegment& segment); + /**@}*/ } // namespace hyrise -// Include these only now to break up include dependencies +// Include these only now to break up include dependencies. #include "create_iterable_from_reference_segment.ipp" #include "create_iterable_from_segment.ipp" diff --git a/src/lib/storage/create_iterable_from_segment.ipp b/src/lib/storage/create_iterable_from_segment.ipp index b05e909721..46b56e72d1 100644 --- a/src/lib/storage/create_iterable_from_segment.ipp +++ b/src/lib/storage/create_iterable_from_segment.ipp @@ -6,6 +6,8 @@ #include "storage/run_length_segment/run_length_segment_iterable.hpp" #include "storage/segment_iterables/any_segment_iterable.hpp" #include "storage/value_segment/value_segment_iterable.hpp" +#include "storage/variable_string_dictionary/variable_string_dictionary_iterable.hpp" +#include "storage/variable_string_dictionary_segment.hpp" namespace hyrise { @@ -21,7 +23,7 @@ auto create_iterable_from_segment(const ValueSegment& segment) { template auto create_iterable_from_segment(const DictionarySegment& segment) { #ifdef HYRISE_ERASE_DICTIONARY - PerformanceWarning("DictionarySegmentIterable erased by compile-time setting"); + PerformanceWarning("DictionarySegmentIterable erased by compile-time setting."); return AnySegmentIterable(DictionarySegmentIterable>(segment)); #else if constexpr (EraseSegmentType) { @@ -35,7 +37,7 @@ auto create_iterable_from_segment(const DictionarySegment& segment) { template auto create_iterable_from_segment(const RunLengthSegment& segment) { #ifdef HYRISE_ERASE_RUNLENGTH - PerformanceWarning("RunLengthSegmentIterable erased by compile-time setting"); + PerformanceWarning("RunLengthSegmentIterable erased by compile-time setting."); return AnySegmentIterable(RunLengthSegmentIterable(segment)); #else if constexpr (EraseSegmentType) { @@ -49,7 +51,7 @@ auto create_iterable_from_segment(const RunLengthSegment& segment) { template auto create_iterable_from_segment(const FixedStringDictionarySegment& segment) { #ifdef HYRISE_ERASE_FIXEDSTRINGDICTIONARY - PerformanceWarning("FixedStringDictionarySegmentIterable erased by compile-time setting"); + PerformanceWarning("FixedStringDictionarySegmentIterable erased by compile-time setting."); return AnySegmentIterable(DictionarySegmentIterable(segment)); #else if constexpr (EraseSegmentType) { @@ -63,7 +65,7 @@ auto create_iterable_from_segment(const FixedStringDictionarySegment& segment template auto create_iterable_from_segment(const FrameOfReferenceSegment& segment) { #ifdef HYRISE_ERASE_FRAMEOFREFERENCE - PerformanceWarning("FrameOfReferenceSegmentIterable erased by compile-time setting"); + PerformanceWarning("FrameOfReferenceSegmentIterable erased by compile-time setting."); return AnySegmentIterable(FrameOfReferenceSegmentIterable(segment)); #else if constexpr (EraseSegmentType) { @@ -81,4 +83,17 @@ auto create_iterable_from_segment(const LZ4Segment& segment) { return AnySegmentIterable(LZ4SegmentIterable(segment)); } +template +auto create_iterable_from_segment(const VariableStringDictionarySegment& segment) { +#ifdef HYRISE_ERASE_VARIABLESTRINGDICTIONARY + PerformanceWarning("VariableStringDictionarySegmentIterable erased by compile-time setting."); + return AnySegmentIterable(DictionarySegmentIterable(segment)); +#else + if constexpr (EraseSegmentType) { + return create_any_segment_iterable(segment); + 
} else { + return VariableStringDictionarySegmentIterable{segment}; + } +#endif +} } // namespace hyrise diff --git a/src/lib/storage/dictionary_segment.hpp b/src/lib/storage/dictionary_segment.hpp index f7bd7468dc..6a7787fb28 100644 --- a/src/lib/storage/dictionary_segment.hpp +++ b/src/lib/storage/dictionary_segment.hpp @@ -12,7 +12,7 @@ namespace hyrise { class BaseCompressedVector; /** - * @brief Segment implementing dictionary encoding + * @brief Segment implementing dictionary encoding. * * Uses vector compression schemes for its attribute vector. */ diff --git a/src/lib/storage/dictionary_segment/dictionary_segment_iterable.hpp b/src/lib/storage/dictionary_segment/dictionary_segment_iterable.hpp index c2b084b92c..e72e74e4cf 100644 --- a/src/lib/storage/dictionary_segment/dictionary_segment_iterable.hpp +++ b/src/lib/storage/dictionary_segment/dictionary_segment_iterable.hpp @@ -8,6 +8,7 @@ #include "storage/dictionary_segment.hpp" #include "storage/fixed_string_dictionary_segment.hpp" #include "storage/segment_iterables.hpp" +#include "storage/variable_string_dictionary_segment.hpp" #include "storage/vector_compression/resolve_compressed_vector_type.hpp" namespace hyrise { @@ -23,6 +24,9 @@ class DictionarySegmentIterable : public PointAccessibleSegmentIterable& segment) : _segment{segment}, _dictionary(segment.fixed_string_dictionary()) {} + explicit DictionarySegmentIterable(const VariableStringDictionarySegment& segment) + : _segment{segment}, _dictionary(segment.dictionary()) {} + template void _on_with_iterators(const Functor& functor) const { _segment.access_counter[SegmentAccessCounter::AccessType::Sequential] += _segment.size(); diff --git a/src/lib/storage/encoding_type.hpp b/src/lib/storage/encoding_type.hpp index f674447798..a96b68782b 100644 --- a/src/lib/storage/encoding_type.hpp +++ b/src/lib/storage/encoding_type.hpp @@ -23,7 +23,15 @@ namespace hyrise { namespace hana = boost::hana; -enum class EncodingType : uint8_t { Unencoded, Dictionary, RunLength, FixedStringDictionary, FrameOfReference, LZ4 }; +enum class EncodingType : uint8_t { + Unencoded, + Dictionary, + RunLength, + FixedStringDictionary, + FrameOfReference, + LZ4, + VariableStringDictionary +}; std::ostream& operator<<(std::ostream& stream, const EncodingType encoding_type); @@ -40,6 +48,7 @@ constexpr auto supported_data_types_for_encoding_type = hana::make_map( hana::make_pair(enum_c, data_types), hana::make_pair(enum_c, data_types), hana::make_pair(enum_c, hana::tuple_t), + hana::make_pair(enum_c, hana::tuple_t), hana::make_pair(enum_c, hana::tuple_t), hana::make_pair(enum_c, data_types)); diff --git a/src/lib/storage/reference_segment/reference_segment_iterable.hpp b/src/lib/storage/reference_segment/reference_segment_iterable.hpp index d56803fb7e..fd832ef329 100644 --- a/src/lib/storage/reference_segment/reference_segment_iterable.hpp +++ b/src/lib/storage/reference_segment/reference_segment_iterable.hpp @@ -72,6 +72,12 @@ class ReferenceSegmentIterable : public SegmentIterable>) { + return; + } +#endif + #ifdef HYRISE_ERASE_FRAMEOFREFERENCE if constexpr (std::is_same_v) { if constexpr (std::is_same_v>) { diff --git a/src/lib/storage/resolve_encoded_segment_type.hpp b/src/lib/storage/resolve_encoded_segment_type.hpp index a68d0c9d26..9d3ec21ecb 100644 --- a/src/lib/storage/resolve_encoded_segment_type.hpp +++ b/src/lib/storage/resolve_encoded_segment_type.hpp @@ -15,6 +15,7 @@ #include "storage/frame_of_reference_segment.hpp" #include "storage/lz4_segment.hpp" #include 
"storage/run_length_segment.hpp" +#include "storage/variable_string_dictionary_segment.hpp" #include "utils/enum_constant.hpp" #include "utils/template_type.hpp" @@ -33,7 +34,9 @@ constexpr auto encoded_segment_for_type = hana::make_map( hana::make_pair(enum_c, template_c), hana::make_pair(enum_c, template_c), - hana::make_pair(enum_c, template_c)); + hana::make_pair(enum_c, template_c), + hana::make_pair(enum_c, + template_c)); // When adding something here, please also append all_segment_encoding_specs in the BaseTest class. @@ -53,22 +56,18 @@ void resolve_encoded_segment_type(const AbstractEncodedSegment& segment, const F constexpr auto encoding_type = hana::value(encoding_type_c); - // If the segment's encoding type matches that of the pair, we have found the segment's type + // If the segment's encoding type matches that of the pair, we have found the segment's type. if (!match_found && (encoding_type == segment.encoding_type())) { // Check if ColumnDataType is supported by encoding const auto data_type_supported = encoding_supports_data_type(encoding_type_c, hana::type_c); - // clang-format off - // Compile only if ColumnDataType is supported - if constexpr(hana::value(data_type_supported)) { + if constexpr (hana::value(data_type_supported)) { using SegmentTemplateType = typename decltype(segment_template_c)::type; using SegmentType = typename SegmentTemplateType::template _template; functor(static_cast(segment)); } - // clang-format on - return true; } diff --git a/src/lib/storage/segment_encoding_utils.cpp b/src/lib/storage/segment_encoding_utils.cpp index 62f3ad70bf..ae7abec456 100644 --- a/src/lib/storage/segment_encoding_utils.cpp +++ b/src/lib/storage/segment_encoding_utils.cpp @@ -11,6 +11,7 @@ #include "storage/lz4_segment/lz4_encoder.hpp" #include "storage/reference_segment.hpp" #include "storage/run_length_segment/run_length_encoder.hpp" +#include "storage/variable_string_dictionary/variable_string_dictionary_encoder.hpp" #include "storage/vector_compression/compressed_vector_type.hpp" #include "storage/vector_compression/vector_compression.hpp" #include "utils/assert.hpp" @@ -29,12 +30,13 @@ const auto encoder_for_type = std::map()}, {EncodingType::FixedStringDictionary, std::make_shared>()}, {EncodingType::FrameOfReference, std::make_shared()}, - {EncodingType::LZ4, std::make_shared()}}; + {EncodingType::LZ4, std::make_shared()}, + {EncodingType::VariableStringDictionary, std::make_shared()}}; } // namespace std::unique_ptr create_encoder(EncodingType encoding_type) { - Assert(encoding_type != EncodingType::Unencoded, "Encoding type must not be Unencoded`."); + Assert(encoding_type != EncodingType::Unencoded, "Encoding type must not be Unencoded."); auto iter = encoder_for_type.find(encoding_type); Assert(iter != encoder_for_type.cend(), "All encoding types must be in encoder_for_type."); @@ -58,7 +60,7 @@ SegmentEncodingSpec get_segment_encoding_spec(const std::shared_ptrencoding_type(), vector_compression}; } - Fail("Unexpected segment encoding found."); + Fail("Unexpected segment encoding."); } VectorCompressionType parent_vector_compression_type(const CompressedVectorType compressed_vector_type) { @@ -71,7 +73,7 @@ VectorCompressionType parent_vector_compression_type(const CompressedVectorType case CompressedVectorType::BitPacking: return VectorCompressionType::BitPacking; } - Fail("Invalid enum value."); + Fail("Invalid compressed vector type."); } } // namespace hyrise diff --git a/src/lib/storage/variable_string_dictionary/variable_string_dictionary_encoder.hpp 
b/src/lib/storage/variable_string_dictionary/variable_string_dictionary_encoder.hpp new file mode 100644 index 0000000000..f0e774141d --- /dev/null +++ b/src/lib/storage/variable_string_dictionary/variable_string_dictionary_encoder.hpp @@ -0,0 +1,132 @@ +#pragma once + +#include +#include +#include +#include +#include +#include + +#include + +#include "storage/base_segment_encoder.hpp" +#include "storage/dictionary_segment.hpp" +#include "storage/fixed_string_dictionary_segment.hpp" +#include "storage/segment_iterables/any_segment_iterable.hpp" +#include "storage/value_segment.hpp" +#include "storage/variable_string_dictionary_segment.hpp" +#include "storage/vector_compression/base_compressed_vector.hpp" +#include "storage/vector_compression/vector_compression.hpp" +#include "types.hpp" +#include "utils/enum_constant.hpp" + +namespace hyrise { + +/** + * @brief Encodes a segment using variable string dictionary encoding and compresses its attribute vector using vector compression. + * + * The algorithm first creates an attribute vector of standard width (uint32_t) and then compresses it + * using fixed-width integer encoding. + */ +class VariableStringDictionaryEncoder : public SegmentEncoder { + public: + static constexpr auto _encoding_type = enum_c; + static constexpr auto _uses_vector_compression = true;  // see base_segment_encoder.hpp for details + + std::shared_ptr _on_encode(const AnySegmentIterable segment_iterable, + const PolymorphicAllocator& allocator) { + // Vectors to gather the input segment's data. This data is used in a later step to + // construct the actual dictionary and attribute vector. + auto dense_values = std::vector<pmr_string>();  // Contains the distinct values (no NULLs). + auto null_values = std::vector<bool>();  // Bitmap marking NULL positions. + // Maps each string to the ChunkOffsets at which it occurs, to speed up building the ChunkOffset-to-ValueID mapping below. + auto string_to_chunk_offsets = boost::unordered_flat_map<pmr_string, std::vector<ChunkOffset>>{}; + auto segment_size = uint32_t{0}; + + // Iterate over the segment, collecting the distinct values and recording the chunk offsets at which each value occurs. + segment_iterable.with_iterators([&](auto segment_it, const auto segment_end) { + segment_size = std::distance(segment_it, segment_end); + dense_values.reserve(segment_size);  // Potentially overallocate for segments with NULLs. + null_values.resize(segment_size);  // Resized to the segment's size. + + for (auto current_position = size_t{0}; segment_it != segment_end; ++segment_it, ++current_position) { + const auto segment_item = *segment_it; + if (!segment_item.is_null()) { + const auto& segment_value = segment_item.value(); + // Only insert unique values to avoid a call to std::unique later. + if (!string_to_chunk_offsets.contains(segment_value)) { + dense_values.push_back(segment_value); + } + string_to_chunk_offsets[segment_value].push_back(ChunkOffset(current_position)); + } else { + null_values[current_position] = true; + } + } + }); + + // Sort the distinct strings; duplicates were already filtered out during the scan above. + std::sort(dense_values.begin(), dense_values.end()); + dense_values.shrink_to_fit(); + + // Compute the total dictionary size, including one null terminator per string. + const auto total_size = + std::accumulate(dense_values.begin(), dense_values.end(), size_t{0}, [](const size_t acc, const pmr_string& value) { + return acc + value.size() + 1; + }); + + // Reject dictionaries whose offsets would not fit into 32 bits. + Assert(total_size < std::numeric_limits<uint32_t>::max(), "Dictionary is too large!"); + + // Uniform character array ("clob") containing all distinct strings back to back. + auto clob = std::make_shared<pmr_vector<char>>(pmr_vector<char>(total_size)); + // We assume the dictionary data stays below 4 GiB (see the Assert above).
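+ // To illustrate the layout (editor's example, not from the original patch): for the sorted distinct + // strings {"ab", "c"}, the clob holds {'a', 'b', '\0', 'c', '\0'} and the offset vector built below + // holds {0, 3}; the string for value ID 1 starts at clob offset 3 and ends before the next null byte.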
+ // Maps string to offset in clob. + auto string_offsets = boost::unordered_flat_map<pmr_string, uint32_t>{}; + // Maps string to ValueID for the attribute vector. + auto string_value_ids = boost::unordered_flat_map<pmr_string, ValueID>{}; + auto last_offset = uint32_t{0}; + auto last_value_id = ValueID{0}; + + // Construct the clob, writing a null terminator after each string (hence the + 1 in the surrounding code). + for (const auto& value : dense_values) { + memcpy(clob->data() + last_offset, value.c_str(), value.size() + 1); + string_offsets[value] = static_cast<uint32_t>(last_offset); + string_value_ids[value] = last_value_id++; + last_offset += value.size() + 1; + } + + // Maps ChunkOffset to ValueID. + auto chunk_offset_to_value_id = pmr_vector<uint32_t>(static_cast<size_t>(segment_size)); + auto offset_vector = std::make_shared<pmr_vector<uint32_t>>(pmr_vector<uint32_t>(dense_values.size())); + offset_vector->shrink_to_fit(); + + // Construct the attribute and offset vectors. + for (const auto& [string, chunk_offsets] : string_to_chunk_offsets) { + const auto offset = string_offsets[string]; + const auto value_id = string_value_ids[string]; + (*offset_vector)[value_id] = offset; + for (const auto chunk_offset : chunk_offsets) { + chunk_offset_to_value_id[chunk_offset] = value_id; + } + } + + // Fix up NULL values (the mapping above covers only actual data, not NULLs). + const auto null_value_id = dense_values.size(); + for (auto offset = ChunkOffset{0}; offset < segment_size; ++offset) { + const auto is_null = null_values[offset]; + if (is_null) { + chunk_offset_to_value_id[offset] = null_value_id; + } + } + + // last_value_id equals the segment's maximum value ID, i.e., the NULL value ID. + const auto compressed_chunk_offset_to_value_id = std::shared_ptr<const BaseCompressedVector>(compress_vector( + chunk_offset_to_value_id, SegmentEncoder::vector_compression_type(), allocator, + {last_value_id})); + + return std::make_shared<VariableStringDictionarySegment<pmr_string>>(clob, compressed_chunk_offset_to_value_id, + offset_vector); + } +}; + +}  // namespace hyrise diff --git a/src/lib/storage/variable_string_dictionary/variable_string_dictionary_iterable.hpp b/src/lib/storage/variable_string_dictionary/variable_string_dictionary_iterable.hpp new file mode 100644 index 0000000000..59c532f993 --- /dev/null +++ b/src/lib/storage/variable_string_dictionary/variable_string_dictionary_iterable.hpp @@ -0,0 +1,203 @@ +#pragma once + +#include +#include +#include + +#include "storage/abstract_segment.hpp" +#include "storage/fixed_string_dictionary_segment.hpp" +#include "storage/segment_iterables.hpp" +#include "storage/variable_string_dictionary_segment.hpp" +#include "storage/vector_compression/resolve_compressed_vector_type.hpp" + +namespace hyrise { + +template +class VariableStringDictionarySegmentIterable + : public PointAccessibleSegmentIterable<VariableStringDictionarySegmentIterable<T>> { + public: + using ValueType = T; + using Dictionary = pmr_vector<char>; + + explicit VariableStringDictionarySegmentIterable(const VariableStringDictionarySegment& segment) + : _segment{segment}, _dictionary(segment.dictionary()) {} + + template + void _on_with_iterators(const Functor& functor) const { + _segment.access_counter[SegmentAccessCounter::AccessType::Sequential] += _segment.size(); + _segment.access_counter[SegmentAccessCounter::AccessType::Dictionary] += _segment.size(); + + resolve_compressed_vector_type(*_segment.attribute_vector(), [this, &functor](const auto& vector) { + using CompressedVectorIterator = decltype(vector.cbegin()); + using DictionaryIteratorType = decltype(_dictionary->cbegin()); + + const auto& offset_vector = _segment.offset_vector(); + + auto begin = + Iterator{_dictionary->cbegin(), _segment.null_value_id(), + vector.cbegin(), ChunkOffset{0u},
+ offset_vector, _dictionary}; + auto end = Iterator{ + _dictionary->cbegin(), _segment.null_value_id(), + vector.cend(), static_cast(_segment.size()), + offset_vector, _dictionary}; + + functor(begin, end); + }); + } + + template + void _on_with_iterators(const std::shared_ptr& position_filter, const Functor& functor) const { + _segment.access_counter[SegmentAccessCounter::access_type(*position_filter)] += position_filter->size(); + _segment.access_counter[SegmentAccessCounter::AccessType::Dictionary] += position_filter->size(); + + resolve_compressed_vector_type(*_segment.attribute_vector(), [this, &functor, + &position_filter](const auto& vector) { + using Decompressor = std::decay_t; + using DictionaryIteratorType = decltype(_dictionary->cbegin()); + + using PosListIteratorType = decltype(position_filter->cbegin()); + auto begin = + PointAccessIterator{_dictionary->cbegin(), + _segment.null_value_id(), + vector.create_decompressor(), + position_filter->cbegin(), + position_filter->cbegin(), + _segment.offset_vector(), + _dictionary}; + auto end = + PointAccessIterator{_dictionary->cbegin(), + _segment.null_value_id(), + vector.create_decompressor(), + position_filter->cbegin(), + position_filter->cend(), + _segment.offset_vector(), + _dictionary}; + functor(begin, end); + }); + } + + size_t _on_size() const { + return _segment.size(); + } + + private: + template + class Iterator + : public AbstractSegmentIterator, SegmentPosition> { + public: + using ValueType = T; + using IterableType = VariableStringDictionarySegmentIterable; + + Iterator(DictionaryIteratorType dictionary_begin_it, ValueID null_value_id, CompressedVectorIterator attribute_it, + ChunkOffset chunk_offset, const std::shared_ptr>& offset_vector, + const std::shared_ptr>& dictionary) + : _dictionary_begin_it{std::move(dictionary_begin_it)}, + _null_value_id{null_value_id}, + _attribute_it{std::move(attribute_it)}, + _chunk_offset{chunk_offset}, + _offset_vector{offset_vector}, + _dictionary{dictionary} {} + + private: + friend class boost::iterator_core_access; // grants the boost::iterator_facade access to the private interface + + void increment() { + ++_attribute_it; + ++_chunk_offset; + } + + void decrement() { + --_attribute_it; + --_chunk_offset; + } + + void advance(std::ptrdiff_t n) { + _attribute_it += n; + _chunk_offset += n; + } + + bool equal(const Iterator& other) const { + return _attribute_it == other._attribute_it; + } + + std::ptrdiff_t distance_to(const Iterator& other) const { + return other._attribute_it - _attribute_it; + } + + SegmentPosition dereference() const { + const auto value_id = static_cast(*_attribute_it); + const auto is_null = (value_id == _null_value_id); + + if (is_null) { + return SegmentPosition{T{}, true, _chunk_offset}; + } + + return SegmentPosition{ + T{VariableStringDictionarySegment::get_string(*_offset_vector, *_dictionary, value_id)}, false, + _chunk_offset}; + } + + private: + DictionaryIteratorType _dictionary_begin_it; + ValueID _null_value_id; + CompressedVectorIterator _attribute_it; + ChunkOffset _chunk_offset; + std::shared_ptr> _offset_vector; + std::shared_ptr _dictionary; + }; + + template + class PointAccessIterator : public AbstractPointAccessSegmentIterator< + PointAccessIterator, + SegmentPosition, PosListIteratorType> { + public: + using ValueType = T; + using IterableType = VariableStringDictionarySegmentIterable; + + PointAccessIterator(DictionaryIteratorType dictionary_begin_it, const ValueID null_value_id, + Decompressor attribute_decompressor, 
PosListIteratorType position_filter_begin, + PosListIteratorType position_filter_it, + const std::shared_ptr>& offset_vector, + const std::shared_ptr>& dictionary) + : AbstractPointAccessSegmentIterator< + PointAccessIterator, SegmentPosition, + PosListIteratorType>{std::move(position_filter_begin), std::move(position_filter_it)}, + _dictionary_begin_it{std::move(dictionary_begin_it)}, + _null_value_id{null_value_id}, + _attribute_decompressor{std::move(attribute_decompressor)}, + _offset_vector{offset_vector}, + _dictionary{dictionary} {} + + private: + friend class boost::iterator_core_access; // grants the boost::iterator_facade access to the private interface + + SegmentPosition dereference() const { + const auto& chunk_offsets = this->chunk_offsets(); + + const auto value_id = _attribute_decompressor.get(chunk_offsets.offset_in_referenced_chunk); + const auto is_null = (value_id == _null_value_id); + + if (is_null) { + return SegmentPosition{T{}, true, chunk_offsets.offset_in_poslist}; + } + + return SegmentPosition{ + T{VariableStringDictionarySegment::get_string(*_offset_vector, *_dictionary, ValueID{value_id})}, false, + chunk_offsets.offset_in_poslist}; + } + + private: + DictionaryIteratorType _dictionary_begin_it; + ValueID _null_value_id; + mutable Decompressor _attribute_decompressor; + std::shared_ptr> _offset_vector; + std::shared_ptr _dictionary; + }; + + private: + const VariableStringDictionarySegment& _segment; + std::shared_ptr _dictionary; +}; + +} // namespace hyrise diff --git a/src/lib/storage/variable_string_dictionary/variable_string_vector.cpp b/src/lib/storage/variable_string_dictionary/variable_string_vector.cpp new file mode 100644 index 0000000000..a353659887 --- /dev/null +++ b/src/lib/storage/variable_string_dictionary/variable_string_vector.cpp @@ -0,0 +1,35 @@ +#include "variable_string_vector.hpp" + +#include +#include +#include + +#include "storage/variable_string_dictionary/variable_string_vector_iterator.hpp" +#include "types.hpp" + +namespace hyrise { + +hyrise::VariableStringVector::VariableStringVector(const std::shared_ptr>& dictionary, + const std::shared_ptr>& offset_vector) + : _dictionary{dictionary}, _offset_vector{offset_vector} {} + +VariableStringVectorIterator VariableStringVector::begin() const noexcept { + return VariableStringVectorIterator(_dictionary, _offset_vector, ValueID{0}); +} + +VariableStringVectorIterator VariableStringVector::end() const noexcept { + return VariableStringVectorIterator(_dictionary, _offset_vector, ValueID(size())); +} + +VariableStringVectorIterator VariableStringVector::cbegin() const noexcept { + return begin(); +} + +VariableStringVectorIterator VariableStringVector::cend() const noexcept { + return end(); +} + +size_t VariableStringVector::size() const { + return _offset_vector->size(); +} +} // namespace hyrise diff --git a/src/lib/storage/variable_string_dictionary/variable_string_vector.hpp b/src/lib/storage/variable_string_dictionary/variable_string_vector.hpp new file mode 100644 index 0000000000..aed703c446 --- /dev/null +++ b/src/lib/storage/variable_string_dictionary/variable_string_vector.hpp @@ -0,0 +1,31 @@ +#pragma once + +#include +#include + +#include "storage/variable_string_dictionary_segment.hpp" +#include "types.hpp" + +namespace hyrise { + +class VariableStringVectorIterator; + +class VariableStringVector { + public: + explicit VariableStringVector(const std::shared_ptr>& dictionary, + const std::shared_ptr>& offset_vector); + + VariableStringVectorIterator begin() const noexcept; + 
VariableStringVectorIterator end() const noexcept; + + VariableStringVectorIterator cbegin() const noexcept; + VariableStringVectorIterator cend() const noexcept; + + size_t size() const; + + protected: + std::shared_ptr<pmr_vector<char>> _dictionary; + std::shared_ptr<pmr_vector<uint32_t>> _offset_vector; +}; + +}  // namespace hyrise diff --git a/src/lib/storage/variable_string_dictionary/variable_string_vector_iterator.hpp b/src/lib/storage/variable_string_dictionary/variable_string_vector_iterator.hpp new file mode 100644 index 0000000000..37e2fa94ac --- /dev/null +++ b/src/lib/storage/variable_string_dictionary/variable_string_vector_iterator.hpp @@ -0,0 +1,53 @@ +#pragma once + +#include + +#include "storage/variable_string_dictionary_segment.hpp" + +namespace hyrise { + +using DereferenceValue = std::string_view; + +class VariableStringVectorIterator : public boost::iterator_facade { + public: + explicit VariableStringVectorIterator(const std::shared_ptr<pmr_vector<char>>& dictionary, + const std::shared_ptr<pmr_vector<uint32_t>>& offset_vector, + ValueID current_value_id) + : _dictionary{dictionary}, _offset_vector{offset_vector}, _current_value_id{current_value_id} {} + + protected: + friend class boost::iterator_core_access; + + // We have a couple of NOLINTs here because the facade expects these method names: + + bool equal(VariableStringVectorIterator const& other) const {  // NOLINT + return _dictionary == other._dictionary && _current_value_id == other._current_value_id; + } + + size_t distance_to(VariableStringVectorIterator const& other) const {  // NOLINT + return static_cast<size_t>(other._current_value_id) - static_cast<size_t>(_current_value_id); + } + + void advance(size_t n) {  // NOLINT + _current_value_id += n; + } + + void increment() {  // NOLINT + ++_current_value_id; + } + + void decrement() {  // NOLINT + --_current_value_id; + } + + const std::string_view dereference() const {  // NOLINT + return VariableStringDictionarySegment::get_string(*_offset_vector, *_dictionary, _current_value_id); + } + + std::shared_ptr<pmr_vector<char>> _dictionary; + std::shared_ptr<pmr_vector<uint32_t>> _offset_vector; + ValueID _current_value_id; +}; + +}  // namespace hyrise diff --git a/src/lib/storage/variable_string_dictionary_segment.cpp b/src/lib/storage/variable_string_dictionary_segment.cpp new file mode 100644 index 0000000000..3c79b72a02 --- /dev/null +++ b/src/lib/storage/variable_string_dictionary_segment.cpp @@ -0,0 +1,190 @@ +#include "variable_string_dictionary_segment.hpp" + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "all_type_variant.hpp" +#include "resolve_type.hpp" +#include "storage/abstract_segment.hpp" +#include "storage/base_dictionary_segment.hpp" +#include "storage/encoding_type.hpp" +#include "storage/segment_access_counter.hpp" +#include "storage/variable_string_dictionary/variable_string_vector.hpp" +#include "storage/vector_compression/compressed_vector_type.hpp" +#include "types.hpp" +#include "utils/assert.hpp" +#include "utils/performance_warning.hpp" + +namespace hyrise { + +template +  requires(std::is_same_v) +VariableStringDictionarySegment::VariableStringDictionarySegment( + const std::shared_ptr<pmr_vector<char>>& dictionary, + const std::shared_ptr<const BaseCompressedVector>& attribute_vector, + const std::shared_ptr<pmr_vector<uint32_t>>& offset_vector) + : BaseDictionarySegment(data_type_from_type()), + _dictionary{dictionary}, + _attribute_vector{attribute_vector}, + _decompressor{attribute_vector->create_base_decompressor()}, + _offset_vector{offset_vector} { + // NULL is represented by _offset_vector.size().
INVALID_VALUE_ID, which is the highest possible number in + // ValueID::base_type (2^32 - 1), is needed to represent "value not found" in calls to lower_bound/upper_bound. + // For a VariableStringDictionarySegment of the max size Chunk::MAX_SIZE, those two values overlap. + + Assert(_offset_vector->size() < std::numeric_limits::max(), + "Input segment too large to store as VariableStringDictionarySegment."); +} + +template + requires(std::is_same_v) +std::shared_ptr> VariableStringDictionarySegment::dictionary() const { + return _dictionary; +} + +template + requires(std::is_same_v) +std::shared_ptr VariableStringDictionarySegment::variable_string_dictionary() const { + return std::make_shared(dictionary(), _offset_vector); +} + +template + requires(std::is_same_v) +AllTypeVariant VariableStringDictionarySegment::operator[](const ChunkOffset chunk_offset) const { + PerformanceWarning("operator[] used."); + DebugAssert(chunk_offset != INVALID_CHUNK_OFFSET, "Passed chunk offset must be valid."); + + const auto value = get_typed_value(chunk_offset); + return value ? value.value() : NULL_VALUE; +} + +template + requires(std::is_same_v) +ChunkOffset VariableStringDictionarySegment::size() const { + return static_cast(_attribute_vector->size()); +} + +template + requires(std::is_same_v) +std::shared_ptr VariableStringDictionarySegment::copy_using_allocator( + const PolymorphicAllocator& alloc) const { + auto new_attribute_vector = _attribute_vector->copy_using_allocator(alloc); + auto new_dictionary = std::make_shared>(*_dictionary, alloc); + auto new_offset = std::make_shared>(*_offset_vector, alloc); + auto copy = std::make_shared(std::move(new_dictionary), + std::move(new_attribute_vector), std::move(new_offset)); + copy->access_counter = access_counter; + return copy; +} + +template + requires(std::is_same_v) +size_t VariableStringDictionarySegment::memory_usage(const MemoryUsageCalculationMode /*mode*/) const { + using OffsetVectorType = std::decay_tbegin())>; + return _attribute_vector->data_size() + _dictionary->capacity() + + _offset_vector->capacity() * sizeof(OffsetVectorType); +} + +template + requires(std::is_same_v) +std::optional VariableStringDictionarySegment::compressed_vector_type() const { + return _attribute_vector->type(); +} + +template + requires(std::is_same_v) +EncodingType VariableStringDictionarySegment::encoding_type() const { + return EncodingType::VariableStringDictionary; +} + +template + requires(std::is_same_v) +ValueID VariableStringDictionarySegment::lower_bound(const AllTypeVariant& value) const { + DebugAssert(!variant_is_null(value), "NULL value passed."); + access_counter[SegmentAccessCounter::AccessType::Dictionary] += + static_cast(std::ceil(std::log2(_offset_vector->size()))); + + const auto typed_value = boost::get(value); + + auto it = std::lower_bound(_offset_vector->begin(), _offset_vector->end(), typed_value, + [this](const auto& offset, const auto& to_find) { + const auto value = std::string_view{_dictionary->data() + offset}; + return value < to_find; + }); + if (it == _offset_vector->end()) { + return INVALID_VALUE_ID; + } + return ValueID{static_cast(std::distance(_offset_vector->begin(), it))}; +} + +template + requires(std::is_same_v) +ValueID VariableStringDictionarySegment::upper_bound(const AllTypeVariant& value) const { + DebugAssert(!variant_is_null(value), "NULL value passed."); + access_counter[SegmentAccessCounter::AccessType::Dictionary] += + static_cast(std::ceil(std::log2(_offset_vector->size()))); + + const auto typed_value = 
boost::get(value); + + auto it = std::upper_bound(_offset_vector->begin(), _offset_vector->end(), typed_value, + [this](const auto& to_find, const auto& offset) { + const auto value = std::string_view{_dictionary->data() + offset}; + return value > to_find; + }); + if (it == _offset_vector->end()) { + return INVALID_VALUE_ID; + } + return ValueID{static_cast(std::distance(_offset_vector->begin(), it))}; +} + +template + requires(std::is_same_v) +AllTypeVariant VariableStringDictionarySegment::value_of_value_id(const ValueID value_id) const { + // We do not increase SegmentAccessCounter in true case because we do not access the dictionary. + return value_id == null_value_id() ? NULL_VALUE : typed_value_of_value_id(value_id); +} + +template + requires(std::is_same_v) +pmr_string VariableStringDictionarySegment::typed_value_of_value_id(const ValueID value_id) const { + DebugAssert(value_id < _offset_vector->size(), "ValueID out of bounds."); + access_counter[SegmentAccessCounter::AccessType::Dictionary] += 1; + + return pmr_string{get_string(*_offset_vector, *_dictionary, value_id)}; +} + +template + requires(std::is_same_v) +ValueID::base_type VariableStringDictionarySegment::unique_values_count() const { + return _offset_vector->size(); +} + +template + requires(std::is_same_v) +std::shared_ptr VariableStringDictionarySegment::attribute_vector() const { + return _attribute_vector; +} + +template + requires(std::is_same_v) +ValueID VariableStringDictionarySegment::null_value_id() const { + return ValueID{static_cast(_offset_vector->size())}; +} + +template + + requires(std::is_same_v) +const std::shared_ptr>& VariableStringDictionarySegment::offset_vector() const { + return _offset_vector; +} + +template class VariableStringDictionarySegment; + +} // namespace hyrise diff --git a/src/lib/storage/variable_string_dictionary_segment.hpp b/src/lib/storage/variable_string_dictionary_segment.hpp new file mode 100644 index 0000000000..1d60699e0f --- /dev/null +++ b/src/lib/storage/variable_string_dictionary_segment.hpp @@ -0,0 +1,126 @@ +#pragma once + +#include +#include +#include +#include + +#include "base_dictionary_segment.hpp" +#include "storage/vector_compression/base_compressed_vector.hpp" +#include "types.hpp" + +namespace hyrise { + +class BaseCompressedVector; +class VariableStringVector; + +/** + * @brief Segment implementing variable length string encoding. + * + * Uses vector compression schemes for its attribute vector. + */ +template + + requires(std::is_same_v) +class VariableStringDictionarySegment : public BaseDictionarySegment { + public: + VariableStringDictionarySegment(const std::shared_ptr>& dictionary, + const std::shared_ptr& attribute_vector, + const std::shared_ptr>& offset_vector); + + // returns an underlying dictionary + std::shared_ptr> dictionary() const; + + std::shared_ptr variable_string_dictionary() const; + + /** + * @defgroup AbstractSegment interface + * @{ + */ + + AllTypeVariant operator[](const ChunkOffset chunk_offset) const final; + + std::optional get_typed_value(const ChunkOffset chunk_offset) const { + // performance critical - not in cpp to help with inlining + const auto value_id = _decompressor->get(chunk_offset); + if (value_id == null_value_id()) { + // We do not increase SegmentAccessCounter here because we do not access the dictionary. + return std::nullopt; + } + + // TODO(student): Does this impact performance? 
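+ // Editor's note: the call below materializes a pmr_string, i.e., it copies the characters out of + // the clob; a std::string_view-based path would avoid the copy but would tie the returned value's + // lifetime to the segment.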
+ return typed_value_of_value_id(ValueID{value_id}); + } + + ChunkOffset size() const final; + + std::shared_ptr<AbstractSegment> copy_using_allocator(const PolymorphicAllocator<size_t>& alloc) const final; + + size_t memory_usage(const MemoryUsageCalculationMode mode) const final; + /**@}*/ + + /** + * @defgroup AbstractEncodedSegment interface + * @{ + */ + std::optional<CompressedVectorType> compressed_vector_type() const final; + /**@}*/ + + /** + * @defgroup BaseDictionarySegment interface + * @{ + */ + EncodingType encoding_type() const final; + + // Returns the first value ID that refers to a value >= the search value and INVALID_VALUE_ID if all values are + // smaller than the search value. Here, INVALID_VALUE_ID does not represent NULL (which isn't stored in the + // dictionary anyway). Imagine a segment with values from 1 to 10. A scan for `WHERE a < 12` would retrieve + // `lower_bound(12) == INVALID_VALUE_ID` and compare all values in the attribute vector to `< INVALID_VALUE_ID`. + // Thus, returning INVALID_VALUE_ID makes comparisons much easier. However, the caller has to make sure that + // NULL values stored in the attribute vector (stored with a value ID of unique_values_count()) are excluded. + // See #1471 for a deeper discussion. + ValueID lower_bound(const AllTypeVariant& value) const final; + + // Returns the first value ID that refers to a value > the search value and INVALID_VALUE_ID if all values are + // smaller than or equal to the search value (see also lower_bound). + ValueID upper_bound(const AllTypeVariant& value) const final; + + AllTypeVariant value_of_value_id(const ValueID value_id) const final; + + pmr_string typed_value_of_value_id(const ValueID value_id) const; + + ValueID::base_type unique_values_count() const final; + + std::shared_ptr<const BaseCompressedVector> attribute_vector() const final; + + ValueID null_value_id() const final; + + const std::shared_ptr<pmr_vector<uint32_t>>& offset_vector() const; + + static inline std::string_view get_string(const pmr_vector<uint32_t>& offset_vector, + const pmr_vector<char>& dictionary, ValueID value_id) { + const auto offset = offset_vector[value_id]; + auto next_offset = size_t{0}; + + if (value_id >= offset_vector.size() - 1) { + next_offset = dictionary.size(); + } else { + next_offset = offset_vector[value_id + 1]; + } + const auto string_length = next_offset - offset - 1; + + return std::string_view{dictionary.data() + offset, string_length}; + } + + protected: + const std::shared_ptr<pmr_vector<char>> _dictionary; + // Maps chunk offsets to value ids. + const std::shared_ptr<const BaseCompressedVector> _attribute_vector; + std::unique_ptr<BaseVectorDecompressor> _decompressor; + // Maps value ids to dictionary offsets.
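+ // Entry i holds the clob offset at which the string for value ID i begins; its end is found via the + // next entry (or the clob's end), minus the null terminator (see get_string() above).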
+ const std::shared_ptr> _offset_vector; +}; + +extern template class VariableStringDictionarySegment; + +} // namespace hyrise diff --git a/src/test/CMakeLists.txt b/src/test/CMakeLists.txt index 50a304cb1e..44521bdba7 100644 --- a/src/test/CMakeLists.txt +++ b/src/test/CMakeLists.txt @@ -217,6 +217,7 @@ set( lib/storage/table_column_definition_test.cpp lib/storage/table_test.cpp lib/storage/value_segment_test.cpp + lib/storage/variable_string_dictionary_segment_test.cpp lib/tasks/chunk_compression_task_test.cpp lib/utils/atomic_max_test.cpp lib/utils/check_table_equal_test.cpp diff --git a/src/test/base_test.hpp b/src/test/base_test.hpp index 21272d0d5f..7f359183e1 100644 --- a/src/test/base_test.hpp +++ b/src/test/base_test.hpp @@ -106,7 +106,8 @@ const SegmentEncodingSpec all_segment_encoding_specs[]{ SegmentEncodingSpec{EncodingType::FixedStringDictionary, VectorCompressionType::BitPacking}, SegmentEncodingSpec{EncodingType::FrameOfReference}, SegmentEncodingSpec{EncodingType::LZ4}, - SegmentEncodingSpec{EncodingType::RunLength}}; + SegmentEncodingSpec{EncodingType::RunLength}, + SegmentEncodingSpec{EncodingType::VariableStringDictionary}}; template inline auto enum_formatter = [](const ::testing::TestParamInfo& info) { diff --git a/src/test/lib/import_export/binary/binary_parser_test.cpp b/src/test/lib/import_export/binary/binary_parser_test.cpp index 4824ad7d7f..3dd51f2abc 100644 --- a/src/test/lib/import_export/binary/binary_parser_test.cpp +++ b/src/test/lib/import_export/binary/binary_parser_test.cpp @@ -313,6 +313,56 @@ TEST_F(BinaryParserTest, FixedStringDictionaryMultipleChunks) { EXPECT_TABLE_EQ_ORDERED(table, expected_table); } +TEST_F(BinaryParserTest, VariableStringDictionarySingleChunk) { + TableColumnDefinitions column_definitions; + column_definitions.emplace_back("a", DataType::String, false); + + auto expected_table = std::make_shared(column_definitions, TableType::Data, ChunkOffset{10}); + expected_table->append({"This"}); + expected_table->append({"is"}); + expected_table->append({"a"}); + expected_table->append({"test"}); + + auto table = BinaryParser::parse(_reference_filepath + + ::testing::UnitTest::GetInstance()->current_test_info()->name() + ".bin"); + + EXPECT_TABLE_EQ_ORDERED(table, expected_table); +} + +TEST_F(BinaryParserTest, VariableStringDictionaryNullValue) { + TableColumnDefinitions column_definitions; + column_definitions.emplace_back("a", DataType::String, true); + + auto expected_table = std::make_shared
<Table>(column_definitions, TableType::Data, ChunkOffset{10}); + expected_table->append({"This"}); + expected_table->append({"is"}); + expected_table->append({"a"}); + expected_table->append({NULL_VALUE}); + expected_table->append({"test"}); + expected_table->append({NULL_VALUE}); + + auto table = BinaryParser::parse(_reference_filepath + + ::testing::UnitTest::GetInstance()->current_test_info()->name() + ".bin"); + + EXPECT_TABLE_EQ_ORDERED(table, expected_table); +} + +TEST_F(BinaryParserTest, VariableStringDictionaryMultipleChunks) { + TableColumnDefinitions column_definitions; + column_definitions.emplace_back("a", DataType::String, false); + + auto expected_table = std::make_shared
<Table>(column_definitions, TableType::Data, ChunkOffset{3}); + expected_table->append({"This"}); + expected_table->append({"is"}); + expected_table->append({"a"}); + expected_table->append({"test"}); + + auto table = BinaryParser::parse(_reference_filepath + + ::testing::UnitTest::GetInstance()->current_test_info()->name() + ".bin"); + + EXPECT_TABLE_EQ_ORDERED(table, expected_table); +} + TEST_F(BinaryParserTest, NullValuesFrameOfReferenceSegment) { TableColumnDefinitions column_definitions; column_definitions.emplace_back("a", DataType::Int, true); diff --git a/src/test/lib/import_export/binary/binary_writer_test.cpp b/src/test/lib/import_export/binary/binary_writer_test.cpp index cb23bad243..c368a87b83 100644 --- a/src/test/lib/import_export/binary/binary_writer_test.cpp +++ b/src/test/lib/import_export/binary/binary_writer_test.cpp @@ -106,6 +106,65 @@ TEST_F(BinaryWriterTest, FixedStringDictionaryMultipleChunks) { reference_filepath + ::testing::UnitTest::GetInstance()->current_test_info()->name() + ".bin", filename)); } +TEST_F(BinaryWriterTest, VariableStringDictionarySingleChunk) { + TableColumnDefinitions column_definitions; + column_definitions.emplace_back("a", DataType::String, false); + + auto table = std::make_shared
<Table>(column_definitions, TableType::Data, ChunkOffset{10}); + table->append({"This"}); + table->append({"is"}); + table->append({"a"}); + table->append({"test"}); + + table->last_chunk()->set_immutable(); + ChunkEncoder::encode_all_chunks(table, SegmentEncodingSpec{EncodingType::VariableStringDictionary}); + BinaryWriter::write(*table, filename); + + EXPECT_TRUE(file_exists(filename)); + EXPECT_TRUE(compare_files( + reference_filepath + ::testing::UnitTest::GetInstance()->current_test_info()->name() + ".bin", filename)); +} + +TEST_F(BinaryWriterTest, VariableStringDictionaryNullValue) { + TableColumnDefinitions column_definitions; + column_definitions.emplace_back("a", DataType::String, true); + + auto table = std::make_shared
<Table>(column_definitions, TableType::Data, ChunkOffset{10}); + table->append({"This"}); + table->append({"is"}); + table->append({"a"}); + table->append({NULL_VALUE}); + table->append({"test"}); + table->append({NULL_VALUE}); + + table->last_chunk()->set_immutable(); + ChunkEncoder::encode_all_chunks(table, SegmentEncodingSpec{EncodingType::VariableStringDictionary}); + BinaryWriter::write(*table, filename); + + EXPECT_TRUE(file_exists(filename)); + EXPECT_TRUE(compare_files( + reference_filepath + ::testing::UnitTest::GetInstance()->current_test_info()->name() + ".bin", filename)); +} + +TEST_F(BinaryWriterTest, VariableStringDictionaryMultipleChunks) { + TableColumnDefinitions column_definitions; + column_definitions.emplace_back("a", DataType::String, false); + + auto table = std::make_shared
<Table>(column_definitions, TableType::Data, ChunkOffset{3}); + table->append({"This"}); + table->append({"is"}); + table->append({"a"}); + table->append({"test"}); + + table->last_chunk()->set_immutable(); + ChunkEncoder::encode_all_chunks(table, SegmentEncodingSpec{EncodingType::VariableStringDictionary}); + BinaryWriter::write(*table, filename); + + EXPECT_TRUE(file_exists(filename)); + EXPECT_TRUE(compare_files( + reference_filepath + ::testing::UnitTest::GetInstance()->current_test_info()->name() + ".bin", filename)); +} + TEST_F(BinaryWriterTest, NullValuesFrameOfReferenceSegment) { TableColumnDefinitions column_definitions; column_definitions.emplace_back("a", DataType::Int, true); diff --git a/src/test/lib/operators/table_scan_string_test.cpp b/src/test/lib/operators/table_scan_string_test.cpp index 41227f0311..ca5acecdbf 100644 --- a/src/test/lib/operators/table_scan_string_test.cpp +++ b/src/test/lib/operators/table_scan_string_test.cpp @@ -61,7 +61,8 @@ class OperatorsTableScanStringTest : public BaseTest, public ::testing::WithPara INSTANTIATE_TEST_SUITE_P(EncodingTypes, OperatorsTableScanStringTest, ::testing::Values(EncodingType::Unencoded, EncodingType::Dictionary, - EncodingType::FixedStringDictionary, EncodingType::RunLength), + EncodingType::FixedStringDictionary, EncodingType::VariableStringDictionary, + EncodingType::RunLength), enum_formatter); TEST_P(OperatorsTableScanStringTest, ScanEquals) { diff --git a/src/test/lib/storage/encoded_string_segment_test.cpp b/src/test/lib/storage/encoded_string_segment_test.cpp index 0afe6a7c21..5170db92b3 100644 --- a/src/test/lib/storage/encoded_string_segment_test.cpp +++ b/src/test/lib/storage/encoded_string_segment_test.cpp @@ -305,6 +305,9 @@ TEST_F(EncodedStringSegmentTest, SegmentReencoding) { value_segment, DataType::String, SegmentEncodingSpec{EncodingType::FixedStringDictionary, VectorCompressionType::BitPacking}); EXPECT_SEGMENT_EQ_ORDERED(value_segment, encoded_segment); + encoded_segment = this->_encode_segment(value_segment, DataType::String, + SegmentEncodingSpec{EncodingType::VariableStringDictionary}); + EXPECT_SEGMENT_EQ_ORDERED(value_segment, encoded_segment); encoded_segment = this->_encode_segment(value_segment, DataType::String, SegmentEncodingSpec{EncodingType::LZ4}); EXPECT_SEGMENT_EQ_ORDERED(value_segment, encoded_segment); diff --git a/src/test/lib/storage/variable_string_dictionary_segment_test.cpp b/src/test/lib/storage/variable_string_dictionary_segment_test.cpp new file mode 100644 index 0000000000..465befcfe9 --- /dev/null +++ b/src/test/lib/storage/variable_string_dictionary_segment_test.cpp @@ -0,0 +1,234 @@ +#include +#include + +#include "base_test.hpp" +#include "storage/chunk_encoder.hpp" +#include "storage/create_iterable_from_segment.hpp" +#include "storage/value_segment.hpp" +#include "storage/variable_string_dictionary/variable_string_vector.hpp" +#include "storage/variable_string_dictionary/variable_string_vector_iterator.hpp" +#include "storage/variable_string_dictionary_segment.hpp" +#include "storage/vector_compression/fixed_width_integer/fixed_width_integer_vector.hpp" + +namespace hyrise { + +class StorageVariableStringDictionarySegmentTest : public BaseTest { + protected: + std::shared_ptr<ValueSegment<pmr_string>> vs_str = std::make_shared<ValueSegment<pmr_string>>(); +}; + +TEST_F(StorageVariableStringDictionarySegmentTest, CompressSegmentString) { + vs_str->append("Bill"); + vs_str->append("Steve"); + vs_str->append("Alexander"); + vs_str->append("Steve"); + vs_str->append("Hasso"); + vs_str->append("Bill"); + + auto segment =
ChunkEncoder::encode_segment(vs_str, DataType::String, + SegmentEncodingSpec{EncodingType::VariableStringDictionary}); + const auto dict_segment = std::dynamic_pointer_cast>(segment); + + // Test attribute_vector size. + EXPECT_EQ(dict_segment->size(), 6u); + EXPECT_EQ(dict_segment->attribute_vector()->size(), 6u); + + // Test dictionary size (uniqueness). + EXPECT_EQ(dict_segment->unique_values_count(), 4u); +} + +TEST_F(StorageVariableStringDictionarySegmentTest, Decode) { + vs_str->append("Bill"); + vs_str->append("Steve"); + vs_str->append("Bill"); + + const auto segment = ChunkEncoder::encode_segment(vs_str, DataType::String, + SegmentEncodingSpec{EncodingType::VariableStringDictionary}); + const auto dict_segment = std::dynamic_pointer_cast>(segment); + + EXPECT_EQ(dict_segment->encoding_type(), EncodingType::VariableStringDictionary); + EXPECT_EQ(dict_segment->compressed_vector_type(), CompressedVectorType::FixedWidthInteger1Byte); + + // Decode values. + EXPECT_EQ((*dict_segment)[ChunkOffset{0}], AllTypeVariant("Bill")); + EXPECT_EQ((*dict_segment)[ChunkOffset{1}], AllTypeVariant("Steve")); + EXPECT_EQ((*dict_segment)[ChunkOffset{2}], AllTypeVariant("Bill")); +} + +TEST_F(StorageVariableStringDictionarySegmentTest, LowerUpperBound) { + vs_str->append("A"); + vs_str->append("C"); + vs_str->append("E"); + vs_str->append("G"); + vs_str->append("I"); + vs_str->append("K"); + + const auto segment = ChunkEncoder::encode_segment(vs_str, DataType::String, + SegmentEncodingSpec{EncodingType::VariableStringDictionary}); + const auto dict_segment = std::dynamic_pointer_cast>(segment); + + // Test for AllTypeVariant as parameter. + EXPECT_EQ(dict_segment->lower_bound(AllTypeVariant("E")), ValueID{2}); + EXPECT_EQ(dict_segment->upper_bound(AllTypeVariant("E")), ValueID{3}); + + EXPECT_EQ(dict_segment->lower_bound(AllTypeVariant("F")), ValueID{3}); + EXPECT_EQ(dict_segment->upper_bound(AllTypeVariant("F")), ValueID{3}); + + EXPECT_EQ(dict_segment->lower_bound(AllTypeVariant("Z")), INVALID_VALUE_ID); + EXPECT_EQ(dict_segment->upper_bound(AllTypeVariant("Z")), INVALID_VALUE_ID); +} + +TEST_F(StorageVariableStringDictionarySegmentTest, NullValues) { + const auto vs_str = std::make_shared>(true); + + vs_str->append("A"); + vs_str->append(NULL_VALUE); + vs_str->append("E"); + + const auto segment = ChunkEncoder::encode_segment(vs_str, DataType::String, + SegmentEncodingSpec{EncodingType::VariableStringDictionary}); + const auto dict_segment = std::dynamic_pointer_cast>(segment); + + EXPECT_EQ(dict_segment->null_value_id(), 2u); + EXPECT_TRUE(variant_is_null((*dict_segment)[ChunkOffset{1}])); +} + +TEST_F(StorageVariableStringDictionarySegmentTest, MemoryUsageEstimation) { + /** + * WARNING: Since it's hard to assert what constitutes a correct "estimation", this just tests basic sanity of the + * memory usage estimations + */ + const auto empty_compressed_segment = ChunkEncoder::encode_segment( + vs_str, DataType::String, SegmentEncodingSpec{EncodingType::VariableStringDictionary}); + const auto empty_dictionary_segment = + std::dynamic_pointer_cast>(empty_compressed_segment); + const auto empty_memory_usage = empty_dictionary_segment->memory_usage(MemoryUsageCalculationMode::Full); + + vs_str->append("A"); + vs_str->append("B"); + vs_str->append("C"); + const auto compressed_segment = ChunkEncoder::encode_segment( + vs_str, DataType::String, SegmentEncodingSpec{EncodingType::VariableStringDictionary}); + const auto dictionary_segment = + std::dynamic_pointer_cast>(compressed_segment); + + static 
constexpr auto size_of_attribute_vector_entry = 1u; + // 3 bytes for the letters plus 3 bytes for the null terminators. + static constexpr auto size_of_dictionary = 6u; + static constexpr auto size_of_offset_vector = sizeof(uint32_t); + + EXPECT_EQ(dictionary_segment->memory_usage(MemoryUsageCalculationMode::Full), + empty_memory_usage + 3 * (size_of_attribute_vector_entry + size_of_offset_vector) + size_of_dictionary); +} + +TEST_F(StorageVariableStringDictionarySegmentTest, TestOffsetVector) { + vs_str->append("ThisIsAVeryLongStringThisIsAVeryLongStringThisIsAVeryLongString"); + vs_str->append("QuiteShort"); + vs_str->append("Short"); + + const auto segment = ChunkEncoder::encode_segment(vs_str, DataType::String, + SegmentEncodingSpec{EncodingType::VariableStringDictionary}); + const auto dict_segment = std::dynamic_pointer_cast<VariableStringDictionarySegment<pmr_string>>(segment); + const auto offset_vector = dict_segment->offset_vector(); + EXPECT_EQ(offset_vector->size(), 3); +} + +TEST_F(StorageVariableStringDictionarySegmentTest, TestLookup) { + const auto allocator = PolymorphicAllocator<size_t>{}; + // Create string data for the clob. + // Contains a zero-length string at the end as an edge case. + const auto data = std::array<char, 30>{"Hello\0World\0Alexander\0String\0"}; + const auto clob = std::make_shared<pmr_vector<char>>(); + const auto clob_size = data.size(); + clob->resize(clob_size); + std::memcpy(clob->data(), data.data(), clob_size); + const pmr_vector<uint32_t> offsets{0, 6, 12, 22, 29}; + const pmr_vector<uint32_t> attribute_vector{0, 0, 1, 3, 2, 4, 2}; + + const auto segment = VariableStringDictionarySegment<pmr_string>{ + clob, + std::shared_ptr<const BaseCompressedVector>( + compress_vector(attribute_vector, VectorCompressionType::FixedWidthInteger, allocator, {4})), + std::make_shared<pmr_vector<uint32_t>>(offsets)}; + + const auto accessors = + std::vector<std::function<AllTypeVariant(const VariableStringDictionarySegment<pmr_string>&, const ChunkOffset)>>{ + +[](const VariableStringDictionarySegment<pmr_string>& segment, const ChunkOffset offset) { + const auto maybe = segment.get_typed_value(offset); + return maybe ?
maybe.value() : NULL_VALUE; + }, + +[](const VariableStringDictionarySegment& segment, const ChunkOffset offset) { + return segment[offset]; + }}; + + for (const auto& accessor : accessors) { + EXPECT_EQ(accessor(segment, ChunkOffset{0}), AllTypeVariant{"Hello"}); + EXPECT_EQ(accessor(segment, ChunkOffset{1}), AllTypeVariant{"Hello"}); + EXPECT_EQ(accessor(segment, ChunkOffset{2}), AllTypeVariant{"World"}); + EXPECT_EQ(accessor(segment, ChunkOffset{3}), AllTypeVariant{"String"}); + EXPECT_EQ(accessor(segment, ChunkOffset{4}), AllTypeVariant{"Alexander"}); + EXPECT_EQ(accessor(segment, ChunkOffset{5}), AllTypeVariant{""}); + EXPECT_EQ(accessor(segment, ChunkOffset{6}), AllTypeVariant{"Alexander"}); + } +} + +TEST_F(StorageVariableStringDictionarySegmentTest, TestIterable) { + const auto value_segment = std::make_shared>(true); + value_segment->append("Bill"); + value_segment->append(""); + value_segment->append("Steve"); + value_segment->append(NULL_VALUE); + value_segment->append("Bill"); + + const auto segment = ChunkEncoder::encode_segment(value_segment, DataType::String, + SegmentEncodingSpec{EncodingType::VariableStringDictionary}); + const auto dict_segment = std::dynamic_pointer_cast>(segment); + const auto iterable = create_iterable_from_segment(*dict_segment); + auto current_chunk_offset = ChunkOffset{0}; + + iterable.for_each([&](const auto& value) { + const auto expected_value = value_segment->operator[](current_chunk_offset); + current_chunk_offset++; + if (variant_is_null(expected_value)) { + EXPECT_TRUE(value.is_null()); + return; + } + ASSERT_FALSE(value.is_null()); + EXPECT_EQ(value.value(), boost::get(expected_value)); + }); +} + +TEST_F(StorageVariableStringDictionarySegmentTest, TestVectorIterator) { + const auto value_segment = std::make_shared>(true); + value_segment->append("Bill"); + value_segment->append(""); + value_segment->append("Steve"); + value_segment->append(NULL_VALUE); + value_segment->append("Bill"); + + const auto segment = ChunkEncoder::encode_segment(value_segment, DataType::String, + SegmentEncodingSpec{EncodingType::VariableStringDictionary}); + const auto dict_segment = std::dynamic_pointer_cast>(segment); + const auto variable_string_vector = dict_segment->variable_string_dictionary(); + auto it = variable_string_vector->begin(); + + EXPECT_EQ("", *it++); + EXPECT_EQ("Bill", *it++); + EXPECT_EQ("Steve", *it++); + + const auto first = variable_string_vector->begin(); + const auto second = first + 1; + auto third = first + 2; + + EXPECT_EQ("", *first); + EXPECT_EQ("Bill", *second); + EXPECT_EQ("Steve", *third); + + EXPECT_EQ(1, second - first); + EXPECT_EQ(2, third - first); + EXPECT_EQ(-1, first - second); + + EXPECT_EQ("Bill", *--third); +} + +} // namespace hyrise diff --git a/src/test/lib/utils/print_utils_test.cpp b/src/test/lib/utils/print_utils_test.cpp index 2f9f37b2e0..b652199ec3 100644 --- a/src/test/lib/utils/print_utils_test.cpp +++ b/src/test/lib/utils/print_utils_test.cpp @@ -122,7 +122,8 @@ TEST_F(PrintUtilsTest, print_expressions) { } TEST_F(PrintUtilsTest, all_encoding_options) { - EXPECT_EQ(all_encoding_options(), "Unencoded, Dictionary, RunLength, FixedStringDictionary, FrameOfReference, LZ4"); + EXPECT_EQ(all_encoding_options(), + "Unencoded, Dictionary, RunLength, FixedStringDictionary, FrameOfReference, LZ4, VariableStringDictionary"); } } // namespace hyrise
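Editor's addendum, not part of the patch: to tie the pieces together, here is a minimal end-to-end sketch of the new encoding, using only APIs exercised by the tests above (ValueSegment, ChunkEncoder::encode_segment, get_typed_value, lower_bound). The function name variable_string_dictionary_example is hypothetical.

#include <memory>

#include "all_type_variant.hpp"
#include "storage/chunk_encoder.hpp"
#include "storage/value_segment.hpp"
#include "storage/variable_string_dictionary_segment.hpp"

using namespace hyrise;

void variable_string_dictionary_example() {
  // Build an unencoded, nullable string segment.
  const auto values = std::make_shared<ValueSegment<pmr_string>>(true);
  values->append("Steve");
  values->append(NULL_VALUE);
  values->append("Bill");

  // Encode it: the encoder writes the sorted distinct strings into the clob
  // and compresses the attribute vector.
  const auto encoded = ChunkEncoder::encode_segment(
      values, DataType::String, SegmentEncodingSpec{EncodingType::VariableStringDictionary});
  const auto segment = std::dynamic_pointer_cast<VariableStringDictionarySegment<pmr_string>>(encoded);

  // Point access: NULLs come back as std::nullopt.
  const auto first = segment->get_typed_value(ChunkOffset{0});   // "Steve"
  const auto second = segment->get_typed_value(ChunkOffset{1});  // std::nullopt

  // The dictionary is sorted, so binary search works as for the other dictionary encodings.
  const auto value_id = segment->lower_bound(AllTypeVariant("Bill"));  // ValueID{0}
}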