Match concurrency to available CPU bandwidth

maxim-kuvyrkov · maxim-kuvyrkov · commit add119cd50e7 · 2021-04-28T17:22:13.000Z
This change allows ninja to throttle the number of parallel tasks based
on feedback from the cpuacct cgroup controller.  It extends the "-l"
parameter to accept negative values; "-l-NN" means that ninja should limit
concurrency when processes in the current cgroup spend more than NN% of
their time waiting for a CPU slice.

E.g., running "ninja -j100 -l-10" on a 32-core machine will quickly
settle on a parallelism of 32-34.

This option is designed to make ninja use all the CPU bandwidth available
to a cgroup-based container, while not starting an excessive number of
processes, which could eat up all RAM.
diff --git a/src/build.cc b/src/build.cc
@@ -692,10 +692,19 @@ void RealCommandRunner::Abort() {
 
 bool RealCommandRunner::CanRunMore() const {
   size_t subproc_number =
-      subprocs_.running_.size() + subprocs_.finished_.size();
-  return (int)subproc_number < config_.parallelism
-    && ((subprocs_.running_.empty() || config_.max_load_average <= 0.0f)
-        || GetLoadAverage() < config_.max_load_average);
+    subprocs_.running_.size() + subprocs_.finished_.size();
+  // Decide whether one more command may start; the parallelism cap is hard.
+  if ((int)subproc_number >= config_.parallelism)
+    return false;
+  // Always allow one task; a zero limit means "no throttling", as before.
+  if (subprocs_.running_.empty() || config_.max_load_average == 0.0f)
+    return true;
+  // Positive limit: throttle on the system load average.
+  if (config_.max_load_average > 0.0f)
+    return GetLoadAverage() < config_.max_load_average;
+  // Negative limit -NN: throttle once tasks wait for CPU more than NN%.
+  return (GetCPUWaitRatio(subproc_number, config_.parallelism)
+          < -config_.max_load_average);
 }
 
 bool RealCommandRunner::StartCommand(Edge* edge) {
diff --git a/src/util.cc b/src/util.cc
@@ -48,6 +48,7 @@
 #elif defined(_AIX) && !defined(__PASE__)
 #include <libperfstat.h>
 #elif defined(linux) || defined(__GLIBC__)
+#include <fstream>
 #include <sys/sysinfo.h>
 #endif
 
@@ -597,6 +598,77 @@ double GetLoadAverage() {
 }
 #endif // _WIN32
 
+double GetCPUWaitRatio(size_t subproc_number, int parallelism) {
+#if defined(linux) || defined(__GLIBC__)  // matches the <fstream> include guard
+  // Filtered percentage of the contention-free CPU time our tasks received;
+  // it persists across calls and the result is its complement (wait time).
+  static double oncpu_ratio = 100.0;
+  static bool have_baseline = false;
+  static uint64_t prev_user(0), prev_system(0), prev_wall(0);
+
+  uint64_t user(0), system(0), wall(0);
+  bool have_user = false, have_system = false, have_wall = false;
+  string token;
+  // Fetch user, system and timestamp counters.  Track whether each one was
+  // actually parsed, so that a legitimate zero is not mistaken for failure.
+  ifstream cpustat("/sys/fs/cgroup/cpuacct/cpuacct.stat", ifstream::in);
+  while (cpustat >> token) {
+    if (token == "user")
+      have_user = !(cpustat >> user).fail();
+    else if (token == "system")
+      have_system = !(cpustat >> system).fail();
+  }
+  ifstream schedstat("/proc/schedstat", ifstream::in);
+  while (schedstat >> token) {
+    if (token == "timestamp") {
+      have_wall = !(schedstat >> wall).fail();
+      break;
+    }
+  }
+  // Counters unavailable (e.g. files missing): report the last estimate.
+  if (!have_user || !have_system || !have_wall)
+    return 100.0 - oncpu_ratio;
+  // On the first sample, or if a counter went backwards (the unsigned
+  // deltas below would wrap around), only (re)record the baseline.
+  if (!have_baseline || user < prev_user || system < prev_system ||
+      wall < prev_wall) {
+    prev_user = user;
+    prev_system = system;
+    prev_wall = wall;
+    have_baseline = true;
+    return 100.0 - oncpu_ratio;
+  }
+
+  // oncpu_ticks is the time that processes in our cgroup spent on all CPUs
+  // combined since the previous measurement.  wall_ticks is the time elapsed
+  // since then, scaled by the number of current subprocesses, i.e. the CPU
+  // time the subprocesses would have received with no contention.
+  uint64_t oncpu_ticks = (user - prev_user) + (system - prev_system);
+  uint64_t wall_ticks = (wall - prev_wall) * subproc_number;
+  // Adjust CONFIG_HZ vs _SC_CLK_TCK scaling.  CONFIG_HZ is 250 in Ubuntu
+  // kernels; there is no known good way to query it from user-space.
+  oncpu_ticks *= 250; // CONFIG_HZ
+  wall_ticks *= sysconf(_SC_CLK_TCK);
+
+  if (wall_ticks > 0) {
+    // Clock advanced: fold the latest measurement into oncpu_ratio through
+    // a simple noise filter, then move the baseline forward.
+    oncpu_ratio *= (double) subproc_number / (subproc_number + 1);
+    oncpu_ratio += (100.0 * oncpu_ticks / wall_ticks) / (subproc_number + 1);
+    prev_user = user;
+    prev_system = system;
+    prev_wall = wall;
+  } else {
+    // Clock didn't advance (typical when tasks start in rapid succession):
+    // decay oncpu_ratio to throttle startups until a fresh measurement.
+    oncpu_ratio *= (double) parallelism / (parallelism + 1);
+  }
+  return 100.0 - oncpu_ratio;
+#else
+  return -1.0;
+#endif
+}
+
 string ElideMiddle(const string& str, size_t width) {
   switch (width) {
       case 0: return "";
diff --git a/src/util.h b/src/util.h
@@ -97,6 +97,10 @@ int GetProcessorCount();
 /// on error.
 double GetLoadAverage();
 
+/// @return filtered percentage of time that the @a subproc_number current
+/// tasks spent waiting for CPU, or -1.0 on unsupported platforms.
+double GetCPUWaitRatio(size_t subproc_number, int parallelism);
+
 /// Elide the given string @a str with '...' in the middle if the length
 /// exceeds @a width.
 std::string ElideMiddle(const std::string& str, size_t width);