# Generates summary results from the data in
# latest/{benchmark}_{duckdb|datafusion}
from prettytable import PrettyTable
import os


def read_file(file_path):
    """Read a `query_no,no_threads,iteration,time` CSV and return the mean
    single-threaded time per query."""
    data = dict()
    with open(file_path, "r") as f:
        for line in f.readlines():
            line = line.strip()
            if not line:
                continue
            query_no, no_threads, iteration, time = line.split(",")
            # only aggregate the single-threaded runs
            if no_threads == "1":
                try:
                    if query_no not in data:
                        data[query_no] = float(time)
                    else:
                        data[query_no] += float(time)
                except ValueError:
                    # skip rows whose time is not a number (e.g. a header)
                    pass
    # average over the 3 iterations recorded for each query
    for k in data:
        data[k] = data[k] / 3
    return data
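
# For illustration only (hypothetical file contents, not shipped with the
# repo): given latest/tpch_datafusion.csv containing
#   1,1,0,5.21
#   1,1,1,5.18
#   1,1,2,5.24
# read_file would return {"1": 5.21}, the mean of the three iterations.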


# remove any keys that only appear in duckdb_results (assumes every query
# in the DataFusion results also appears in the DuckDB results)
def fix_duckdb_results(datafusion_results, duckdb_results):
    new_duckdb_results = dict()
    for k in datafusion_results.keys():
        new_duckdb_results[k] = duckdb_results[k]
    return datafusion_results, new_duckdb_results
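
# Sketch of the filtering with hypothetical values:
#   datafusion_results = {"1": 5.2, "2": 9.9}
#   duckdb_results     = {"1": 2.6, "2": 4.1, "3": 7.0}
# -> query "3" is dropped and both returned dicts cover queries {"1", "2"}.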


if __name__ == "__main__":
    # generate csv table to `latest/overall.csv`
    #
    # Example:
    #
    # Benchmark,Query,DataFusion,DuckDB,Summary
    # tpch,1,16.39,8.1,2.02x slower
    csv_table = list()
    csv_table.append("Benchmark,Query,DataFusion,DuckDB,Summary")
    benches = ["tpch", "h2o", "clickbench"]
    for bench in benches:
        table = list()
        if not os.path.exists(f"latest/{bench}_datafusion.csv"):
            print(f"Did not find data for {bench}, skipping")
            continue
        datafusion_results = read_file(f"latest/{bench}_datafusion.csv")
        duckdb_results = read_file(f"latest/{bench}_duckdb.csv")
        datafusion_results, duckdb_results = fix_duckdb_results(
            datafusion_results, duckdb_results
        )
        print(f"{bench}: ")
        table.append(["Query", "DataFusion", "DuckDB", "Summary (DataFusion / DuckDB)"])
        for df_key, duck_key in zip(datafusion_results.keys(), duckdb_results.keys()):
            df_res = datafusion_results[df_key]
            duck_res = duckdb_results[duck_key]
            ratio = df_res / duck_res
            if ratio > 1:
                summary = f"{round(ratio, 2)}x slower"
            else:
                summary = f"{round(1 / ratio, 2)}x faster"
            table.append([df_key, round(df_res, 2), round(duck_res, 2), summary])
        tab = PrettyTable(table[0])
        tab.add_rows(table[1:])
        print(tab)
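
        # The printed table looks roughly like this (hypothetical numbers):
        # +-------+------------+--------+-------------------------------+
        # | Query | DataFusion | DuckDB | Summary (DataFusion / DuckDB) |
        # +-------+------------+--------+-------------------------------+
        # | 1     | 16.39      | 8.1    | 2.02x slower                  |
        # +-------+------------+--------+-------------------------------+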
        # generate latex table to `latest/{bench}.tex`
        latex_table = list()
        latex_table.append("\\begin{table}[h]")
        latex_table.append("\\centering")
        latex_table.append("\\begin{tabular}{|l|l|l|l|}")
        latex_table.append("\\hline")
        # \textbf must be written as \\textbf here: in a plain string, \t is a tab
        latex_table.append("Query & \\textbf{DataFusion} & \\textbf{DuckDB} & Delta \\\\")
        latex_table.append("\\hline")
        for row in table[1:]:
            latex_table.append(" & ".join([str(x) for x in row]) + " \\\\")
        latex_table.append("\\hline")
        latex_table.append("\\end{tabular}")
        latex_table.append("\\caption{DataFusion vs DuckDB performance comparison}")
        latex_table.append("\\label{table:1}")
        latex_table.append("\\end{table}")
        output_filename = f"latest/{bench}.tex"
        with open(output_filename, "w") as f:
            print(f"Writing tex based tables to {output_filename}")
            f.write("\n".join(latex_table))
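
        # The emitted .tex fragment contains rows like (hypothetical values):
        #   Query & \textbf{DataFusion} & \textbf{DuckDB} & Delta \\
        #   1 & 16.39 & 8.1 & 2.02x slower \\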
        # append this benchmark's rows to the overall csv table
        for row in table[1:]:
            csv_table.append(bench + "," + ",".join([str(x) for x in row]))

    output_filename = "latest/overall.csv"
    with open(output_filename, "w") as f:
        print(f"Writing overall summary to {output_filename}")
        f.write("\n".join(csv_table))