Skip to content

Commit

Permalink
Needed changes for census profiling
Browse files Browse the repository at this point in the history
Includes a reporting script for detecting performance violations
  • Loading branch information
beroy committed Feb 2, 2024
1 parent 1ffa40b commit 64c87aa
Show file tree
Hide file tree
Showing 6 changed files with 75 additions and 14 deletions.
44 changes: 44 additions & 0 deletions profile_report.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,44 @@
"""Report performance anomalies in a benchmark's profiling history.

Takes a benchmark command and a profile-database path, compares the earliest
and the most recent recorded runs, and exits with an error (SystemExit) when
the most recent run is slower than the earliest by more than the allowed
threshold.
"""

import argparse

from profiler import data

parser = argparse.ArgumentParser()
parser.add_argument("benchmark", type=str, help="The profiled command to report on")
parser.add_argument("db_path", type=str, help="Path to the file-based profile DB")

args = parser.parse_args()

# Allowable ratio of performance degradation between profiling runs:
# 1.10 means the current run may be up to 10% slower than the first run.
threshold = 1.10

db = data.FileBasedProfileDB(args.db_path)
dt = db.find(args.benchmark)

# Need at least two recorded runs to compare; otherwise there is nothing to report.
if len(dt) >= 2:
    first_profile = dt[0]
    curr_profile = dt[-1]
    first_time = float(first_profile.user_time_sec)
    curr_time = float(curr_profile.user_time_sec)

    # Stored profiles contain escaped newlines/tabs; unescape them for display.
    formatted_first_profile = str(first_profile).replace("\\n", "\n").replace("\\t", "\t")
    formatted_curr_profile = str(curr_profile).replace("\\n", "\n").replace("\\t", "\t")

    print(f"*** First profile:\n {formatted_first_profile}")
    print(f"*** Current profile:\n {formatted_curr_profile}")
    print(
        f"TileDB version ver = first: {first_profile.tiledbsoma_version} "
        f"curr: {curr_profile.tiledbsoma_version}"
    )

    if curr_time > threshold * first_time:
        # Current run is too slow relative to the first run.
        # (Fixed: labels were previously swapped — "curr" printed first_time.)
        raise SystemExit(
            f"Potential performance degradation detected on {args.benchmark}: "
            f"curr: {curr_time} vs first: {first_time}"
        )

    if threshold * curr_time < first_time:
        # Current run is significantly faster than the first run — an
        # improvement, not an "increase" as the old message claimed.
        print(
            f"Major performance improvement detected on {args.benchmark}: "
            f"curr: {curr_time} vs first: {first_time}"
        )
5 changes: 5 additions & 0 deletions profiler/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
"""Public interface of the ``profiler`` package.

Re-exports the ``data`` submodule so callers can use ``profiler.data``.
"""
from . import data

__all__ = [
    "data",
]
4 changes: 2 additions & 2 deletions profiler/__main__.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
from .profiler import main
import profiler

if __name__ == "__main__":
main()
profiler.main()
10 changes: 8 additions & 2 deletions profiler/data.py
Original file line number Diff line number Diff line change
Expand Up @@ -87,6 +87,7 @@ class FileBasedProfileDB(ProfileDB):

def __init__(self, path: str = DEFAULT_PROFILE_DB_PATH):
    """Open a file-based profile DB rooted at *path*.

    Creates the root directory on first use if it does not already exist.
    """
    # NOTE: leftover debug print of the path was removed here.
    self.path = path
    if not os.path.exists(self.path):
        os.mkdir(self.path)

Expand All @@ -111,17 +112,22 @@ def find(self, command) -> List[ProfileData]:
for filename in glob.glob(f"{self.path}/{key}/*.json"):
with open(filename, "r") as file:
result.append(ProfileData(**json.load(file)))
print(f"Find: loading the file as read {file}")
return result

def add(self, data: ProfileData) -> str:
key = _command_key(data.command)
os.makedirs(f"{self.path}/{key}", exist_ok=True)
with open(f"{self.path}/{key}/command.txt", "w") as f:
f.write(data.command.strip())
filename = f"{self.path}/{key}/command.txt"

if not os.path.exists(filename):
with open(filename, "w") as f:
f.write(data.command.strip())
key2 = data.timestamp

filename = f"{self.path}/{key}/{key2}.json"
if os.path.exists(filename):
os.remove(filename)
with open(filename, "w") as f:
json.dump(attr.asdict(data), f)

Expand Down
24 changes: 15 additions & 9 deletions profiler/profiler.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,8 @@
from context_generator import host_context

import tiledbsoma
from data import FileBasedProfileDB, ProfileData, ProfileDB
import context_generator
import data as pdata

GNU_TIME_FORMAT = (
'Command being timed: "%C"\n'
Expand Down Expand Up @@ -74,7 +75,7 @@

def build_profile_data(
stderr_: str, stdout_: str, prof1: Optional[str], prof2: Optional[str]
) -> ProfileData:
) -> pdata.ProfileData:
"""Parse the time utility output to extract performance and memory metrics"""
gnu_time_output_values = GNU_TIME_OUTPUT_REGEXP.search(stderr_)
assert gnu_time_output_values
Expand All @@ -85,17 +86,18 @@ def build_profile_data(
{k: int(v) for k, v in gnu_time_output_values.items() if v.isdigit()}
)

data = ProfileData(
data = pdata.ProfileData(
stdout=stdout_,
stderr=stderr_,
timestamp=datetime.utcnow().timestamp(),
tiledb_stats=read_tiledb_stats_output(),
somacore_version=somacore.__version__,
tiledbsoma_version=tiledbsoma.__version__,
host_context=host_context(),
host_context=context_generator.host_context(),
custom_out=[prof1, prof2],
**gnu_time_output_values,
)

return data


Expand All @@ -109,14 +111,18 @@ def read_tiledb_stats_output() -> Dict[str, Any]:


def main():
data_columns = ", ".join([a for a in dir(ProfileData) if a[0] != "_"])
data_columns = ", ".join([a for a in dir(pdata.ProfileData) if a[0] != "_"])
parser = argparse.ArgumentParser(
epilog=f"The list of collected metrics by the generic profiler: {data_columns}"
)
parser.add_argument(
"command",
help="The command and its arguments to be profiled (as quoted, single-argument)",
)
parser.add_argument(
"db_path",
help="FileDB Path",
)
parser.add_argument(
"-t",
"--gtime-cmd",
Expand Down Expand Up @@ -183,21 +189,22 @@ def main():
print(
f"Third profiler {args.prof2} missing output flamegraph file location"
)

cargs = args.command.split(" ")
p_stdout, p_stderr = p.communicate()
if p1 is not None:
p1.wait()
if p2 is not None:
p2.wait()
print(f"Done profiler run", file=stderr)

p_stdout = p_stdout.decode("utf-8")
print(f"The benchmarked process output:\n {p_stdout}", file=stderr)
# Parse the generated output from the time utility
data: ProfileData = build_profile_data(
data: pdata.ProfileData = build_profile_data(
p_stderr.decode("utf-8"), p_stdout, args.prof1_output, args.prof2_output
)
# Add the run data to DB
db: ProfileDB = FileBasedProfileDB()
db: pdata.ProfileDB = pdata.FileBasedProfileDB(args.db_path)
db_record_file = db.add(data)
db.close()

Expand All @@ -206,6 +213,5 @@ def main():
file=stderr,
)


if __name__ == "__main__":
main()
2 changes: 1 addition & 1 deletion profiler/setup.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
from setuptools import find_packages, setup

setup(
name="soma-profiler",
name="profiler",
version="1.0",
packages=find_packages(),
requires=["gitpython", "psutil", "tiledbsoma", "cellxgene_census"],
Expand Down

0 comments on commit 64c87aa

Please sign in to comment.