Skip to content

Commit 9441ce0

Browse files
committed
Fix: scheduling: overhaul the whole thing
- prevent possible lockup when format in proc changes
- properly get and handle scheduler policy & prio
- recognize and try to handle cgroup-v2 similarly
- on SCHED_RR failing push to the max with SCHED_OTHER
1 parent 030befe commit 9441ce0

File tree

2 files changed

+115
-21
lines changed

2 files changed

+115
-21
lines changed

src/sbd-common.c

Lines changed: 101 additions & 21 deletions
Original file line numberDiff line numberDiff line change
@@ -26,6 +26,9 @@
2626
#include <pwd.h>
2727
#include <unistd.h>
2828
#include <dirent.h>
29+
#include <sys/time.h>
30+
#include <sys/resource.h>
31+
#include <limits.h>
2932

3033
#ifdef _POSIX_MEMLOCK
3134
# include <sys/mman.h>
@@ -298,7 +301,7 @@ watchdog_populate_list(void)
298301
FILE *file;
299302

300303
snprintf(entry_name, sizeof(entry_name),
301-
SYS_CLASS_WATCHDOG "/%s/dev", entry->d_name);
304+
SYS_CLASS_WATCHDOG "/%s/dev", entry->d_name);
302305
file = fopen(entry_name, "r");
303306
if (file) {
304307
int major, minor;
@@ -667,7 +670,7 @@ static int get_realtime_budget(void)
667670
{
668671
FILE *f;
669672
char fname[PATH_MAX];
670-
int res = -1, lnum = 0;
673+
int res = -1, lnum = 0, num;
671674
char *cgroup = NULL, *namespecs = NULL;
672675

673676
snprintf(fname, PATH_MAX, "/proc/%jd/cgroup", (intmax_t)getpid());
@@ -677,7 +680,8 @@ static int get_realtime_budget(void)
677680
(intmax_t)getpid());
678681
goto exit_res;
679682
}
680-
while( fscanf(f, "%d:%m[^:]:%m[^\n]", &lnum, &namespecs, &cgroup) !=EOF ) {
683+
while( (num = fscanf(f, "%d:%m[^:]:%m[^\n]\n", &lnum,
684+
&namespecs, &cgroup)) !=EOF ) {
681685
if (namespecs && strstr(namespecs, "cpuacct")) {
682686
free(namespecs);
683687
break;
@@ -690,6 +694,11 @@ static int get_realtime_budget(void)
690694
free(namespecs);
691695
namespecs = NULL;
692696
}
697+
/* not to get stuck if format changes */
698+
if ((num < 3) && ((fscanf(f, "%*[^\n]") == EOF) ||
699+
(fscanf(f, "\n") == EOF))) {
700+
break;
701+
}
693702
}
694703
fclose(f);
695704
if (cgroup == NULL) {
@@ -720,9 +729,15 @@ static int get_realtime_budget(void)
720729
}
721730

722731
/* stolen from corosync */
732+
733+
#define LEGACY_CGROUP_PROC_PIDS "/sys/fs/cgroup/cpu/tasks"
734+
#define UNIFIED_CGROUP_PROC_PIDS "/sys/fs/cgroup/cgroup.procs"
735+
723736
static int sbd_move_to_root_cgroup(bool enforce_root_cgroup) {
724737
FILE *f;
725-
int res = -1;
738+
int res = -1, num;
739+
char *rt_rq_name = NULL;
740+
const char *root_pids = LEGACY_CGROUP_PROC_PIDS;
726741

727742
/*
728743
* /sys/fs/cgroup is hardcoded, because most of Linux distributions are now
@@ -731,13 +746,53 @@ static int sbd_move_to_root_cgroup(bool enforce_root_cgroup) {
731746
* This feature is expected to be removed as soon as systemd gets support
732747
* for managing RT configuration.
733748
*/
734-
f = fopen("/sys/fs/cgroup/cpu/cpu.rt_runtime_us", "rt");
735-
if (f == NULL) {
736-
cl_log(LOG_DEBUG, "cpu.rt_runtime_us doesn't exist -> "
749+
do {
750+
f = fopen("/sys/fs/cgroup/cpu/cpu.rt_runtime_us", "rt");
751+
if (f) {
752+
break;
753+
}
754+
/* CONFIG_RT_GROUP_SCHED might still be enabled with cgroup-v2
755+
cgroup.procs on cgroup-toplevel tells us we have cgroup-v2
756+
(handy as we already need that to be in selinux-policy)
757+
and name of rt_rq(s) in /proc/sched_debug tells us that
758+
CONFIG_RT_GROUP_SCHED is enabled
759+
cgroup-v2 has been around for a while in the kernel and it
760+
is not a mutually exclusive compile-time configuration - so
761+
checking what is actually mounted to go with what is there
762+
*/
763+
f = fopen(UNIFIED_CGROUP_PROC_PIDS, "rt");
764+
if (f) {
765+
fclose(f);
766+
f = fopen("/proc/sched_debug", "rt");
767+
if (f) {
768+
while (((num = fscanf(f, "rt_rq[%*[^]]]:%m[^\n]\n",
769+
&rt_rq_name)) != EOF) &&
770+
(rt_rq_name == NULL)) {
771+
/* consume a line */
772+
if ((num > 0) || (fscanf(f, "%*[^\n]") == EOF) ||
773+
(fscanf(f, "\n") == EOF)) {
774+
break;
775+
}
776+
}
777+
/* no hierarchical rt-budget distribution with
778+
cgroup-v2 so far - thus checking for budget is
779+
useless
780+
*/
781+
if (rt_rq_name) {
782+
free(rt_rq_name);
783+
enforce_root_cgroup = true;
784+
root_pids = UNIFIED_CGROUP_PROC_PIDS;
785+
break;
786+
}
787+
fclose(f);
788+
}
789+
}
790+
cl_log(LOG_DEBUG, "cpu.rt_runtime_us doesn't exist & "
791+
"/proc/sched_debug doesn't contain rt_rq[...]:/ -> "
737792
"system without cgroup or with disabled CONFIG_RT_GROUP_SCHED");
738793
res = 0;
739794
goto exit_res;
740-
}
795+
} while (0);
741796
fclose(f);
742797

743798
if ((!enforce_root_cgroup) && (get_realtime_budget() > 0)) {
@@ -747,21 +802,23 @@ static int sbd_move_to_root_cgroup(bool enforce_root_cgroup) {
747802
goto exit_res;
748803
}
749804

750-
f = fopen("/sys/fs/cgroup/cpu/tasks", "w");
805+
f = fopen(root_pids, "w");
751806
if (f == NULL) {
752-
cl_log(LOG_WARNING, "Can't open cgroups tasks file for writing");
807+
cl_log(LOG_WARNING, "Can't open %s for writing", root_pids);
753808

754809
goto exit_res;
755810
}
756811

757812
if (fprintf(f, "%jd\n", (intmax_t)getpid()) <= 0) {
758-
cl_log(LOG_WARNING, "Can't write sbd pid into cgroups tasks file");
813+
cl_log(LOG_WARNING, "Can't write sbd pid into %s", root_pids);
759814
goto close_and_exit_res;
760815
}
761816

817+
res = 0;
818+
762819
close_and_exit_res:
763820
if (fclose(f) != 0) {
764-
cl_log(LOG_WARNING, "Can't close cgroups tasks file");
821+
cl_log(LOG_WARNING, "Can't close %s", root_pids);
765822
goto exit_res;
766823
}
767824

@@ -776,15 +833,17 @@ sbd_make_realtime(int priority, int stackgrowK, int heapgrowK)
776833
return;
777834
}
778835

836+
do {
779837
#ifdef SCHED_RR
780838
if (move_to_root_cgroup) {
781839
sbd_move_to_root_cgroup(enforce_moving_to_root_cgroup);
782840
}
783841

784842
{
785-
int pcurrent = 0;
786843
int pmin = sched_get_priority_min(SCHED_RR);
787844
int pmax = sched_get_priority_max(SCHED_RR);
845+
struct sched_param sp;
846+
int pcurrent;
788847

789848
if (priority == 0) {
790849
priority = pmax;
@@ -794,26 +853,47 @@ sbd_make_realtime(int priority, int stackgrowK, int heapgrowK)
794853
priority = pmax;
795854
}
796855

797-
pcurrent = sched_getscheduler(0);
798-
if (pcurrent < 0) {
856+
if (sched_getparam(0, &sp) < 0) {
799857
cl_perror("Unable to get scheduler priority");
800858

801-
} else if(pcurrent < priority) {
802-
struct sched_param sp;
859+
} else if ((pcurrent = sched_getscheduler(0)) < 0) {
860+
cl_perror("Unable to get scheduler policy");
803861

862+
} else if ((pcurrent == SCHED_RR) &&
863+
(sp.sched_priority >= priority)) {
864+
cl_log(LOG_INFO,
865+
"Stay with priority (%d) for policy SCHED_RR",
866+
sp.sched_priority);
867+
break;
868+
} else {
804869
memset(&sp, 0, sizeof(sp));
805870
sp.sched_priority = priority;
806871

807872
if (sched_setscheduler(0, SCHED_RR, &sp) < 0) {
808-
cl_perror("Unable to set scheduler priority to %d", priority);
873+
cl_perror(
874+
"Unable to set scheduler policy to SCHED_RR priority %d",
875+
priority);
809876
} else {
810-
cl_log(LOG_INFO, "Scheduler priority is now %d", priority);
877+
cl_log(LOG_INFO,
878+
"Scheduler policy is now SCHED_RR priority %d",
879+
priority);
880+
break;
811881
}
812882
}
813883
}
814884
#else
815-
cl_log(LOG_ERR, "System does not support updating the scheduler priority");
885+
cl_log(LOG_ERR, "System does not support updating the scheduler policy");
886+
#endif
887+
#ifdef PRIO_PGRP
888+
if (setpriority(PRIO_PGRP, 0, INT_MIN) < 0) {
889+
cl_perror("Unable to raise the scheduler priority");
890+
} else {
891+
cl_log(LOG_INFO, "Scheduler priority raised to the maximum");
892+
}
893+
#else
894+
cl_perror("System does not support setting the scheduler priority");
816895
#endif
896+
} while (0);
817897

818898
sbd_memlock(heapgrowK, stackgrowK);
819899
}
@@ -826,7 +906,7 @@ maximize_priority(void)
826906
return;
827907
}
828908

829-
sbd_make_realtime(0, 256, 256);
909+
sbd_make_realtime(0, 256, 256);
830910

831911
if (ioprio_set(IOPRIO_WHO_PROCESS, getpid(),
832912
IOPRIO_PRIO_VALUE(IOPRIO_CLASS_RT, 1)) != 0) {

src/sbd.sysconfig

Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -104,6 +104,20 @@ SBD_TIMEOUT_ACTION=flush,reboot
104104
# If that is the case sbd will stay in that slice while it will
105105
# be moved to root-slice otherwise.
106106
#
107+
# With cgroup-v2 the behavior is very different.
108+
# With CONFIG_RT_GROUP_SCHED enabled and cpu-controller enabled
109+
# there currently is no way to configure RT-budget in any slice
110+
# but the root-slice. Conversely, if there is RT-budget used
111+
# in any but the root-slice enabling the cpu-controller is
112+
# inhibited.
113+
# Thus - unless strictly disabled by setting 'no' - with cgroup-v2
114+
# and CONFIG_RT_GROUP_SCHED enabled sbd is always moved
115+
# to the root-slice regardless of whether the cpu-controller is at the
116+
# moment enabled or not.
117+
# The reason is that subsequent services might enable the cpu-controller
118+
# or fail doing so if sbd was already using RT-budget in e.g. the
119+
# system-slice.
120+
#
107121
SBD_MOVE_TO_ROOT_CGROUP=auto
108122

109123
## Type: string

0 commit comments

Comments
 (0)