[RFC 9/12][PATCH] SCHED_DEADLINE: system wide bandwidth management
From: Raistlin
Date: Fri Oct 16 2009 - 11:47:10 EST
This commit adds the capability of controlling the maximum, system-wide
CPU bandwidth devoted to SCHED_DEADLINE tasks.
This is done by means of two files:
- /proc/sys/kernel/sched_deadline_runtime_us,
- /proc/sys/kernel/sched_deadline_period_us.
The ratio runtime/period is the total bandwidth all the SCHED_DEADLINE tasks
can use in the system as a whole.
Trying to create tasks beyond this limit fails: as soon as admitting a new
-deadline task (or new parameters for an existing one) would overcome the
bandwidth cap, the request is rejected with -EPERM.
The default is _zero_ available bandwidth, so write some values into these
files before trying to start any SCHED_DEADLINE task. Setting runtime >
period is allowed (i.e., more than 100% bandwidth available to -deadline
tasks), since this makes perfect sense on SMP systems.
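Just to give an idea, here is an illustrative user-space sketch (not part
of this patch) that reserves half of the system-wide bandwidth for
-deadline tasks by writing the two files this commit introduces:

#include <stdio.h>
#include <stdlib.h>

static void write_sysctl(const char *path, long val)
{
	FILE *f = fopen(path, "w");

	if (!f || fprintf(f, "%ld\n", val) < 0 || fclose(f) != 0) {
		perror(path);
		exit(1);
	}
}

int main(void)
{
	/* runtime/period = 500000us / 1000000us = 50% for -deadline tasks */
	write_sysctl("/proc/sys/kernel/sched_deadline_period_us", 1000000);
	write_sysctl("/proc/sys/kernel/sched_deadline_runtime_us", 500000);
	return 0;
}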
Signed-off-by: Raistlin <raistlin@xxxxxxxx>
---
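A note on the math: the admission test reuses the existing to_ratio()
helper (see kernel/sched.c below), which stores utilizations in 20-bit
fixed point (1.0 == 1 << 20), so bandwidths can be summed and compared
with plain integer arithmetic. A quick user-space sketch of the idea
(illustrative only; the kernel uses div64_u64()):

#include <stdio.h>
#include <stdint.h>

static uint64_t to_ratio(uint64_t period, uint64_t runtime)
{
	/* Q20 fixed point: 1 << 20 means 100% of one CPU */
	return (runtime << 20) / period;
}

int main(void)
{
	/* a task with 10ms runtime every 100ms deadline: ~10% */
	uint64_t tsk_bw = to_ratio(100000000ULL, 10000000ULL);
	/* global cap runtime/period = 500000us/1000000us: 50% */
	uint64_t cap = to_ratio(1000000ULL, 500000ULL);

	printf("tsk_bw=%llu cap=%llu admitted=%d\n",
	       (unsigned long long)tsk_bw,
	       (unsigned long long)cap, tsk_bw <= cap);
	return 0;
}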
include/linux/sched.h | 7 ++
kernel/sched.c | 149 ++++++++++++++++++++++++++++++++++++++++++++++++-
kernel/sysctl.c | 16 +++++
3 files changed, 171 insertions(+), 1 deletions(-)
diff --git a/include/linux/sched.h b/include/linux/sched.h
index 478e07c..4de72eb 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -1984,6 +1984,13 @@ int sched_rt_handler(struct ctl_table *table, int write,
void __user *buffer, size_t *lenp,
loff_t *ppos);
+extern unsigned int sysctl_sched_deadline_period;
+extern int sysctl_sched_deadline_runtime;
+
+int sched_deadline_handler(struct ctl_table *table, int write,
+ void __user *buffer, size_t *lenp,
+ loff_t *ppos);
+
extern unsigned int sysctl_sched_compat_yield;
#ifdef CONFIG_RT_MUTEXES
diff --git a/kernel/sched.c b/kernel/sched.c
index 3c3e834..d8b6354 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -870,6 +870,34 @@ static inline u64 global_rt_runtime(void)
return (u64)sysctl_sched_rt_runtime * NSEC_PER_USEC;
}
+/*
+ * deadline_runtime/deadline_period is the maximum bandwidth
+ * -deadline tasks can use. It is system wide, i.e., the sum
+ * of the bandwidths of all the tasks, inside every group and
+ * running on any CPU, has to stay below this value!
+ *
+ * default: 0s (= no bandwidth for -deadline tasks)
+ */
+unsigned int sysctl_sched_deadline_period = 0;
+int sysctl_sched_deadline_runtime = 0;
+
+static inline u64 global_deadline_period(void)
+{
+ return (u64)sysctl_sched_deadline_period * NSEC_PER_USEC;
+}
+
+static inline u64 global_deadline_runtime(void)
+{
+ return (u64)sysctl_sched_deadline_runtime * NSEC_PER_USEC;
+}
+
+/*
+ * locking for the system wide deadline bandwidth management.
+ */
+static DEFINE_MUTEX(deadline_constraints_mutex);
+static DEFINE_SPINLOCK(__sysctl_sched_deadline_lock);
+static u64 __sysctl_sched_deadline_total_bw;
+
#ifndef prepare_arch_switch
# define prepare_arch_switch(next) do { } while (0)
#endif
@@ -2606,6 +2634,66 @@ static unsigned long to_ratio(u64 period, u64 runtime)
return div64_u64(runtime << 20, period);
}
+static inline
+void __deadline_clear_task_bw(struct task_struct *p, u64 tsk_bw)
+{
+ __sysctl_sched_deadline_total_bw -= tsk_bw;
+}
+
+static inline
+void __deadline_add_task_bw(struct task_struct *p, u64 tsk_bw)
+{
+ __sysctl_sched_deadline_total_bw += tsk_bw;
+}
+
+/*
+ * Update the total allocated bandwidth when a task enters -deadline,
+ * leaves it, or stays -deadline but changes its bandwidth.
+ * Returns 1 if the change is admissible, 0 otherwise.
+ */
+static int __deadline_check_task_bw(struct task_struct *p, int policy,
+ struct sched_param_ex *param_ex)
+{
+ u64 bw, tsk_bw;
+ int ret = 0;
+
+ spin_lock(&__sysctl_sched_deadline_lock);
+
+ if (sysctl_sched_deadline_period == 0)
+ goto unlock;
+
+ bw = to_ratio(sysctl_sched_deadline_period,
+ sysctl_sched_deadline_runtime);
+ if (bw == 0)
+ goto unlock;
+
+ if (deadline_policy(policy))
+ tsk_bw = to_ratio(timespec_to_ns(&param_ex->sched_deadline),
+ timespec_to_ns(&param_ex->sched_runtime));
+
+ /*
+ * Whether a task enters, leaves, or stays -deadline but changes
+ * its parameters, we need to update the globally allocated
+ * bandwidth accordingly: leaving releases the task's bandwidth,
+ * while entering or changing parameters is possible only if the
+ * resulting total still fits below the cap.
+ */
+ if (task_has_deadline_policy(p) && !deadline_policy(policy)) {
+ __deadline_clear_task_bw(p, p->dl.bw);
+ ret = 1;
+ } else if (task_has_deadline_policy(p) && deadline_policy(policy) &&
+ bw >= __sysctl_sched_deadline_total_bw - p->dl.bw + tsk_bw) {
+ __deadline_clear_task_bw(p, p->dl.bw);
+ __deadline_add_task_bw(p, tsk_bw);
+ ret = 1;
+ } else if (deadline_policy(policy) && !task_has_deadline_policy(p) &&
+ bw >= __sysctl_sched_deadline_total_bw + tsk_bw) {
+ __deadline_add_task_bw(p, tsk_bw);
+ ret = 1;
+ }
+unlock:
+ spin_unlock(&__sysctl_sched_deadline_lock);
+
+ return ret;
+}
+
/*
* wake_up_new_task - wake up a newly created task for the first time.
*
@@ -2765,8 +2853,10 @@ static void finish_task_switch(struct rq *rq, struct task_struct *prev)
mmdrop(mm);
if (unlikely(prev_state == TASK_DEAD)) {
/* a deadline task is dying: stop the bandwidth timer */
- if (deadline_task(prev))
+ if (deadline_task(prev)) {
+ /* the global bw accounting needs its lock here too */
+ spin_lock(&__sysctl_sched_deadline_lock);
+ __deadline_clear_task_bw(prev, prev->dl.bw);
+ spin_unlock(&__sysctl_sched_deadline_lock);
hrtimer_cancel(&prev->dl.dl_timer);
+ }
/*
* Remove function-return probe instances associated with this
@@ -6372,6 +6462,19 @@ recheck:
spin_unlock_irqrestore(&p->pi_lock, flags);
goto recheck;
}
+ /*
+ * If changing to SCHED_DEADLINE (or changing the parameters of a
+ * SCHED_DEADLINE task), we need to check that enough bandwidth is
+ * available, which might not be the case!
+ */
+ if (deadline_policy(policy) || deadline_task(p)) {
+ if (!__deadline_check_task_bw(p, policy, param_ex)) {
+ __task_rq_unlock(rq);
+ spin_unlock_irqrestore(&p->pi_lock, flags);
+ return -EPERM;
+ }
+ }
+
update_rq_clock(rq);
on_rq = p->se.on_rq;
running = task_current(rq, p);
@@ -10569,6 +10672,25 @@ static int sched_rt_global_constraints(void)
}
#endif /* CONFIG_RT_GROUP_SCHED */
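+/*
+ * Check whether the bandwidth currently allocated to -deadline tasks
+ * still fits within the global cap (a zero period means zero
+ * bandwidth, i.e., no -deadline tasks allowed at all).
+ */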
+static int sched_deadline_global_constraints(void)
+{
+ u64 bw;
+ int ret = 1;
+
+ spin_lock_irq(&__sysctl_sched_deadline_lock);
+ if (sysctl_sched_deadline_period == 0)
+ bw = 0;
+ else
+ bw = to_ratio(global_deadline_period(),
+ global_deadline_runtime());
+
+ if (bw < __sysctl_sched_deadline_total_bw)
+ ret = 0;
+ spin_unlock_irq(&__sysctl_sched_deadline_lock);
+
+ return ret;
+}
+
int sched_rt_handler(struct ctl_table *table, int write,
void __user *buffer, size_t *lenp,
loff_t *ppos)
@@ -10599,6 +10721,31 @@ int sched_rt_handler(struct ctl_table *table, int write,
return ret;
}
+int sched_deadline_handler(struct ctl_table *table, int write,
+ void __user *buffer, size_t *lenp,
+ loff_t *ppos)
+{
+ int ret;
+ int old_period, old_runtime;
+
+ mutex_lock(&deadline_constraints_mutex);
+ old_period = sysctl_sched_deadline_period;
+ old_runtime = sysctl_sched_deadline_runtime;
+
+ ret = proc_dointvec(table, write, buffer, lenp, ppos);
+
+ if (!ret && write) {
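+ /*
+ * The write succeeded: check that the bandwidth already
+ * allocated to -deadline tasks still fits below the new
+ * cap, and roll back to the old values if it does not.
+ */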
+ if (!sched_deadline_global_constraints()) {
+ sysctl_sched_deadline_period = old_period;
+ sysctl_sched_deadline_runtime = old_runtime;
+ ret = -EINVAL;
+ }
+ }
+ mutex_unlock(&deadline_constraints_mutex);
+
+ return ret;
+}
+
#ifdef CONFIG_CGROUP_SCHED
/* return corresponding task_group object of a cgroup */
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index 0d949c5..34117f9 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -373,6 +373,22 @@ static struct ctl_table kern_table[] = {
},
{
.ctl_name = CTL_UNNUMBERED,
+ .procname = "sched_deadline_period_us",
+ .data = &sysctl_sched_deadline_period,
+ .maxlen = sizeof(unsigned int),
+ .mode = 0644,
+ .proc_handler = &sched_deadline_handler,
+ },
+ {
+ .ctl_name = CTL_UNNUMBERED,
+ .procname = "sched_deadline_runtime_us",
+ .data = &sysctl_sched_deadline_runtime,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = &sched_deadline_handler,
+ },
+ {
+ .ctl_name = CTL_UNNUMBERED,
.procname = "sched_compat_yield",
.data = &sysctl_sched_compat_yield,
.maxlen = sizeof(unsigned int),
--
1.6.0.4
--
<<This happens because I choose it to happen!>> (Raistlin Majere)
----------------------------------------------------------------------
Dario Faggioli, ReTiS Lab, Scuola Superiore Sant'Anna, Pisa (Italy)
http://blog.linux.it/raistlin / raistlin@xxxxxxxxx /
dario.faggioli@xxxxxxxxxx