Skip to content

Commit

Permalink
Needed changes for census profiling
Browse files Browse the repository at this point in the history
Includes a reporting script for detecting performance violations
  • Loading branch information
beroy committed Feb 2, 2024
1 parent 1ffa40b commit 64c87aa
Show file tree
Hide file tree
Showing 6 changed files with 75 additions and 14 deletions.
44 changes: 44 additions & 0 deletions profile_report.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,44 @@
"""Report performance anomalies in a benchmark's profiling history.

Takes a benchmark command and a profile-database path, compares the earliest
and the most recent recorded runs, and exits with an error (SystemExit) when
the most recent run is slower than the earliest by more than the allowed
threshold.
"""

import argparse

from profiler import data

parser = argparse.ArgumentParser()
parser.add_argument("benchmark", type=str, help="The profiled command to report on")
parser.add_argument("db_path", type=str, help="Path to the file-based profile DB")

args = parser.parse_args()

# Allowable ratio of performance degradation between profiling runs:
# 1.10 means the current run may be up to 10% slower than the first run.
threshold = 1.10

db = data.FileBasedProfileDB(args.db_path)
dt = db.find(args.benchmark)

# Need at least two recorded runs to compare; otherwise there is nothing to report.
if len(dt) >= 2:
    first_profile = dt[0]
    curr_profile = dt[-1]
    first_time = float(first_profile.user_time_sec)
    curr_time = float(curr_profile.user_time_sec)

    # Stored profiles contain escaped newlines/tabs; unescape them for display.
    formatted_first_profile = str(first_profile).replace("\\n", "\n").replace("\\t", "\t")
    formatted_curr_profile = str(curr_profile).replace("\\n", "\n").replace("\\t", "\t")

    print(f"*** First profile:\n {formatted_first_profile}")
    print(f"*** Current profile:\n {formatted_curr_profile}")
    print(
        f"TileDB version ver = first: {first_profile.tiledbsoma_version} "
        f"curr: {curr_profile.tiledbsoma_version}"
    )

    if curr_time > threshold * first_time:
        # Current run is too slow relative to the first run.
        # (Fixed: labels were previously swapped — "curr" printed first_time.)
        raise SystemExit(
            f"Potential performance degradation detected on {args.benchmark}: "
            f"curr: {curr_time} vs first: {first_time}"
        )

    if threshold * curr_time < first_time:
        # Current run is significantly faster than the first run — an
        # improvement, not an "increase" as the old message claimed.
        print(
            f"Major performance improvement detected on {args.benchmark}: "
            f"curr: {curr_time} vs first: {first_time}"
        )
5 changes: 5 additions & 0 deletions profiler/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
"""Public interface of the ``profiler`` package.

Re-exports the ``data`` submodule so callers can use ``profiler.data``.
"""
from . import data

__all__ = [
    "data",
]
4 changes: 2 additions & 2 deletions profiler/__main__.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
from .profiler import main
import profiler

if __name__ == "__main__":
main()
profiler.main()
10 changes: 8 additions & 2 deletions profiler/data.py
Original file line number Diff line number Diff line change
Expand Up @@ -87,6 +87,7 @@ class FileBasedProfileDB(ProfileDB):

def __init__(self, path: str = DEFAULT_PROFILE_DB_PATH):
    """Open a file-based profile DB rooted at *path*.

    Creates the root directory on first use if it does not already exist.
    """
    # NOTE: leftover debug print of the path was removed here.
    self.path = path
    if not os.path.exists(self.path):
        os.mkdir(self.path)

Expand All @@ -111,17 +112,22 @@ def find(self, command) -> List[ProfileData]:
for filename in glob.glob(f"{self.path}/{key}/*.json"):
with open(filename, "r") as file:
result.append(ProfileData(**json.load(file)))
print(f"Find: loading the file as read {file}")
return result

def add(self, data: ProfileData) -> str:
key = _command_key(data.command)
os.makedirs(f"{self.path}/{key}", exist_ok=True)
with open(f"{self.path}/{key}/command.txt", "w") as f:
f.write(data.command.strip())
filename = f"{self.path}/{key}/command.txt"

if not os.path.exists(filename):
with open(filename, "w") as f:
f.write(data.command.strip())
key2 = data.timestamp

filename = f"{self.path}/{key}/{key2}.json"
if os.path.exists(filename):
os.remove(filename)
with open(filename, "w") as f:
json.dump(attr.asdict(data), f)

Expand Down
24 changes: 15 additions & 9 deletions profiler/profiler.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,8 @@
from context_generator import host_context

import tiledbsoma
from data import FileBasedProfileDB, ProfileData, ProfileDB
import context_generator
import data as pdata

GNU_TIME_FORMAT = (
'Command being timed: "%C"\n'
Expand Down Expand Up @@ -74,7 +75,7 @@

def build_profile_data(
stderr_: str, stdout_: str, prof1: Optional[str], prof2: Optional[str]
) -> ProfileData:
) -> pdata.ProfileData:
"""Parse the time utility output to extract performance and memory metrics"""
gnu_time_output_values = GNU_TIME_OUTPUT_REGEXP.search(stderr_)
assert gnu_time_output_values
Expand All @@ -85,17 +86,18 @@ def build_profile_data(
{k: int(v) for k, v in gnu_time_output_values.items() if v.isdigit()}
)

data = ProfileData(
data = pdata.ProfileData(
stdout=stdout_,
stderr=stderr_,
timestamp=datetime.utcnow().timestamp(),
tiledb_stats=read_tiledb_stats_output(),
somacore_version=somacore.__version__,
tiledbsoma_version=tiledbsoma.__version__,
host_context=host_context(),
host_context=context_generator.host_context(),
custom_out=[prof1, prof2],
**gnu_time_output_values,
)

return data


Expand All @@ -109,14 +111,18 @@ def read_tiledb_stats_output() -> Dict[str, Any]:


def main():
data_columns = ", ".join([a for a in dir(ProfileData) if a[0] != "_"])
data_columns = ", ".join([a for a in dir(pdata.ProfileData) if a[0] != "_"])
parser = argparse.ArgumentParser(
epilog=f"The list of collected metrics by the generic profiler: {data_columns}"
)
parser.add_argument(
"command",
help="The command and its arguments to be profiled (as quoted, single-argument)",
)
parser.add_argument(
"db_path",
help="FileDB Path",
)
parser.add_argument(
"-t",
"--gtime-cmd",
Expand Down Expand Up @@ -183,21 +189,22 @@ def main():
print(
f"Third profiler {args.prof2} missing output flamegraph file location"
)

cargs = args.command.split(" ")
p_stdout, p_stderr = p.communicate()
if p1 is not None:
p1.wait()
if p2 is not None:
p2.wait()
print(f"Done profiler run", file=stderr)

p_stdout = p_stdout.decode("utf-8")
print(f"The benchmarked process output:\n {p_stdout}", file=stderr)
# Parse the generated output from the time utility
data: ProfileData = build_profile_data(
data: pdata.ProfileData = build_profile_data(
p_stderr.decode("utf-8"), p_stdout, args.prof1_output, args.prof2_output
)
# Add the run data to DB
db: ProfileDB = FileBasedProfileDB()
db: pdata.ProfileDB = pdata.FileBasedProfileDB(args.db_path)
db_record_file = db.add(data)
db.close()

Expand All @@ -206,6 +213,5 @@ def main():
file=stderr,
)


if __name__ == "__main__":
main()
2 changes: 1 addition & 1 deletion profiler/setup.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
from setuptools import find_packages, setup

setup(
name="soma-profiler",
name="profiler",
version="1.0",
packages=find_packages(),
requires=["gitpython", "psutil", "tiledbsoma", "cellxgene_census"],
Expand Down

0 comments on commit 64c87aa

Please sign in to comment.