]> Pileus Git - ~andy/linux/blobdiff - kernel/sched.c
sched: fix sd_parent_degenerate on non-numa smp machine
[~andy/linux] / kernel / sched.c
index 9b1e79371c207b37c1617d3f7c0460709a3cc39b..74498c840f933e7a1c63dd7ffb8e8db270e2bc34 100644 (file)
@@ -261,6 +261,10 @@ struct task_group {
        struct cgroup_subsys_state css;
 #endif
 
+#ifdef CONFIG_USER_SCHED
+       uid_t uid;
+#endif
+
 #ifdef CONFIG_FAIR_GROUP_SCHED
        /* schedulable entities of this group on each cpu */
        struct sched_entity **se;
@@ -286,6 +290,12 @@ struct task_group {
 
 #ifdef CONFIG_USER_SCHED
 
+/* Helper function to pass uid information to create_sched_user() */
+void set_tg_uid(struct user_struct *user)
+{
+       user->tg->uid = user->uid;
+}
+
 /*
  * Root task group.
  *     Every UID task group (including init_task_group aka UID-0) will
@@ -703,45 +713,18 @@ static __read_mostly char *sched_feat_names[] = {
 
 #undef SCHED_FEAT
 
-static int sched_feat_open(struct inode *inode, struct file *filp)
+static int sched_feat_show(struct seq_file *m, void *v)
 {
-       filp->private_data = inode->i_private;
-       return 0;
-}
-
-static ssize_t
-sched_feat_read(struct file *filp, char __user *ubuf,
-               size_t cnt, loff_t *ppos)
-{
-       char *buf;
-       int r = 0;
-       int len = 0;
        int i;
 
        for (i = 0; sched_feat_names[i]; i++) {
-               len += strlen(sched_feat_names[i]);
-               len += 4;
-       }
-
-       buf = kmalloc(len + 2, GFP_KERNEL);
-       if (!buf)
-               return -ENOMEM;
-
-       for (i = 0; sched_feat_names[i]; i++) {
-               if (sysctl_sched_features & (1UL << i))
-                       r += sprintf(buf + r, "%s ", sched_feat_names[i]);
-               else
-                       r += sprintf(buf + r, "NO_%s ", sched_feat_names[i]);
+               if (!(sysctl_sched_features & (1UL << i)))
+                       seq_puts(m, "NO_");
+               seq_printf(m, "%s ", sched_feat_names[i]);
        }
+       seq_puts(m, "\n");
 
-       r += sprintf(buf + r, "\n");
-       WARN_ON(r >= len + 2);
-
-       r = simple_read_from_buffer(ubuf, cnt, ppos, buf, r);
-
-       kfree(buf);
-
-       return r;
+       return 0;
 }
 
 static ssize_t
@@ -786,10 +769,17 @@ sched_feat_write(struct file *filp, const char __user *ubuf,
        return cnt;
 }
 
+static int sched_feat_open(struct inode *inode, struct file *filp)
+{
+       return single_open(filp, sched_feat_show, NULL);
+}
+
 static struct file_operations sched_feat_fops = {
-       .open   = sched_feat_open,
-       .read   = sched_feat_read,
-       .write  = sched_feat_write,
+       .open           = sched_feat_open,
+       .write          = sched_feat_write,
+       .read           = seq_read,
+       .llseek         = seq_lseek,
+       .release        = single_release,
 };
 
 static __init int sched_init_debug(void)
@@ -1453,9 +1443,10 @@ static int task_hot(struct task_struct *p, u64 now, struct sched_domain *sd);
 static unsigned long cpu_avg_load_per_task(int cpu)
 {
        struct rq *rq = cpu_rq(cpu);
+       unsigned long nr_running = ACCESS_ONCE(rq->nr_running);
 
-       if (rq->nr_running)
-               rq->avg_load_per_task = rq->load.weight / rq->nr_running;
+       if (nr_running)
+               rq->avg_load_per_task = rq->load.weight / nr_running;
        else
                rq->avg_load_per_task = 0;
 
@@ -1473,27 +1464,13 @@ static void
 update_group_shares_cpu(struct task_group *tg, int cpu,
                        unsigned long sd_shares, unsigned long sd_rq_weight)
 {
-       int boost = 0;
        unsigned long shares;
        unsigned long rq_weight;
 
        if (!tg->se[cpu])
                return;
 
-       rq_weight = tg->cfs_rq[cpu]->load.weight;
-
-       /*
-        * If there are currently no tasks on the cpu pretend there is one of
-        * average load so that when a new task gets to run here it will not
-        * get delayed by group starvation.
-        */
-       if (!rq_weight) {
-               boost = 1;
-               rq_weight = NICE_0_LOAD;
-       }
-
-       if (unlikely(rq_weight > sd_rq_weight))
-               rq_weight = sd_rq_weight;
+       rq_weight = tg->cfs_rq[cpu]->rq_weight;
 
        /*
         *           \Sum shares * rq_weight
@@ -1501,7 +1478,7 @@ update_group_shares_cpu(struct task_group *tg, int cpu,
         *               \Sum rq_weight
         *
         */
-       shares = (sd_shares * rq_weight) / (sd_rq_weight + 1);
+       shares = (sd_shares * rq_weight) / sd_rq_weight;
        shares = clamp_t(unsigned long, shares, MIN_SHARES, MAX_SHARES);
 
        if (abs(shares - tg->se[cpu]->load.weight) >
@@ -1510,11 +1487,7 @@ update_group_shares_cpu(struct task_group *tg, int cpu,
                unsigned long flags;
 
                spin_lock_irqsave(&rq->lock, flags);
-               /*
-                * record the actual number of shares, not the boosted amount.
-                */
-               tg->cfs_rq[cpu]->shares = boost ? 0 : shares;
-               tg->cfs_rq[cpu]->rq_weight = rq_weight;
+               tg->cfs_rq[cpu]->shares = shares;
 
                __set_se_shares(tg->se[cpu], shares);
                spin_unlock_irqrestore(&rq->lock, flags);
@@ -1528,13 +1501,23 @@ update_group_shares_cpu(struct task_group *tg, int cpu,
  */
 static int tg_shares_up(struct task_group *tg, void *data)
 {
-       unsigned long rq_weight = 0;
+       unsigned long weight, rq_weight = 0;
        unsigned long shares = 0;
        struct sched_domain *sd = data;
        int i;
 
        for_each_cpu_mask(i, sd->span) {
-               rq_weight += tg->cfs_rq[i]->load.weight;
+               /*
+                * If there are currently no tasks on the cpu pretend there
+                * is one of average load so that when a new task gets to
+                * run here it will not get delayed by group starvation.
+                */
+               weight = tg->cfs_rq[i]->load.weight;
+               if (!weight)
+                       weight = NICE_0_LOAD;
+
+               tg->cfs_rq[i]->rq_weight = weight;
+               rq_weight += weight;
                shares += tg->cfs_rq[i]->shares;
        }
 
@@ -1544,9 +1527,6 @@ static int tg_shares_up(struct task_group *tg, void *data)
        if (!sd->parent || !(sd->parent->flags & SD_LOAD_BALANCE))
                shares = tg->shares;
 
-       if (!rq_weight)
-               rq_weight = cpus_weight(sd->span) * NICE_0_LOAD;
-
        for_each_cpu_mask(i, sd->span)
                update_group_shares_cpu(tg, i, shares, rq_weight);
 
@@ -1611,6 +1591,39 @@ static inline void update_shares_locked(struct rq *rq, struct sched_domain *sd)
 
 #endif
 
+/*
+ * double_lock_balance - lock the busiest runqueue, this_rq is locked already.
+ */
+static int double_lock_balance(struct rq *this_rq, struct rq *busiest)
+       __releases(this_rq->lock)
+       __acquires(busiest->lock)
+       __acquires(this_rq->lock)
+{
+       int ret = 0;
+
+       if (unlikely(!irqs_disabled())) {
+               /* printk() doesn't work good under rq->lock */
+               spin_unlock(&this_rq->lock);
+               BUG_ON(1);
+       }
+       if (unlikely(!spin_trylock(&busiest->lock))) {
+               if (busiest < this_rq) {
+                       spin_unlock(&this_rq->lock);
+                       spin_lock(&busiest->lock);
+                       spin_lock_nested(&this_rq->lock, SINGLE_DEPTH_NESTING);
+                       ret = 1;
+               } else
+                       spin_lock_nested(&busiest->lock, SINGLE_DEPTH_NESTING);
+       }
+       return ret;
+}
+
+static inline void double_unlock_balance(struct rq *this_rq, struct rq *busiest)
+       __releases(busiest->lock)
+{
+       spin_unlock(&busiest->lock);
+       lock_set_subclass(&this_rq->lock.dep_map, 0, _RET_IP_);
+}
 #endif
 
 #ifdef CONFIG_FAIR_GROUP_SCHED
@@ -2810,40 +2823,6 @@ static void double_rq_unlock(struct rq *rq1, struct rq *rq2)
                __release(rq2->lock);
 }
 
-/*
- * double_lock_balance - lock the busiest runqueue, this_rq is locked already.
- */
-static int double_lock_balance(struct rq *this_rq, struct rq *busiest)
-       __releases(this_rq->lock)
-       __acquires(busiest->lock)
-       __acquires(this_rq->lock)
-{
-       int ret = 0;
-
-       if (unlikely(!irqs_disabled())) {
-               /* printk() doesn't work good under rq->lock */
-               spin_unlock(&this_rq->lock);
-               BUG_ON(1);
-       }
-       if (unlikely(!spin_trylock(&busiest->lock))) {
-               if (busiest < this_rq) {
-                       spin_unlock(&this_rq->lock);
-                       spin_lock(&busiest->lock);
-                       spin_lock_nested(&this_rq->lock, SINGLE_DEPTH_NESTING);
-                       ret = 1;
-               } else
-                       spin_lock_nested(&busiest->lock, SINGLE_DEPTH_NESTING);
-       }
-       return ret;
-}
-
-static void double_unlock_balance(struct rq *this_rq, struct rq *busiest)
-       __releases(busiest->lock)
-{
-       spin_unlock(&busiest->lock);
-       lock_set_subclass(&this_rq->lock.dep_map, 0, _RET_IP_);
-}
-
 /*
  * If dest_cpu is allowed for this process, migrate the task to it.
  * This is accomplished by forcing the cpu_allowed mask to only
@@ -6125,7 +6104,6 @@ static int __migrate_task_irq(struct task_struct *p, int src_cpu, int dest_cpu)
 
 /*
  * Figure out where task on dead CPU should go, use force if necessary.
- * NOTE: interrupts should be disabled by the caller
  */
 static void move_task_off_dead_cpu(int dead_cpu, struct task_struct *p)
 {
@@ -6635,28 +6613,6 @@ early_initcall(migration_init);
 
 #ifdef CONFIG_SCHED_DEBUG
 
-static inline const char *sd_level_to_string(enum sched_domain_level lvl)
-{
-       switch (lvl) {
-       case SD_LV_NONE:
-                       return "NONE";
-       case SD_LV_SIBLING:
-                       return "SIBLING";
-       case SD_LV_MC:
-                       return "MC";
-       case SD_LV_CPU:
-                       return "CPU";
-       case SD_LV_NODE:
-                       return "NODE";
-       case SD_LV_ALLNODES:
-                       return "ALLNODES";
-       case SD_LV_MAX:
-                       return "MAX";
-
-       }
-       return "MAX";
-}
-
 static int sched_domain_debug_one(struct sched_domain *sd, int cpu, int level,
                                  cpumask_t *groupmask)
 {
@@ -6676,8 +6632,7 @@ static int sched_domain_debug_one(struct sched_domain *sd, int cpu, int level,
                return -1;
        }
 
-       printk(KERN_CONT "span %s level %s\n",
-               str, sd_level_to_string(sd->level));
+       printk(KERN_CONT "span %s level %s\n", str, sd->name);
 
        if (!cpu_isset(cpu, sd->span)) {
                printk(KERN_ERR "ERROR: domain->span does not contain "
@@ -6813,6 +6768,8 @@ sd_parent_degenerate(struct sched_domain *sd, struct sched_domain *parent)
                                SD_BALANCE_EXEC |
                                SD_SHARE_CPUPOWER |
                                SD_SHARE_PKG_RESOURCES);
+               if (nr_node_ids == 1)
+                       pflags &= ~SD_SERIALIZE;
        }
        if (~cflags & pflags)
                return 0;
@@ -7333,13 +7290,21 @@ struct allmasks {
 };
 
 #if    NR_CPUS > 128
-#define        SCHED_CPUMASK_ALLOC             1
-#define        SCHED_CPUMASK_FREE(v)           kfree(v)
-#define        SCHED_CPUMASK_DECLARE(v)        struct allmasks *v
+#define SCHED_CPUMASK_DECLARE(v)       struct allmasks *v
+static inline void sched_cpumask_alloc(struct allmasks **masks)
+{
+       *masks = kmalloc(sizeof(**masks), GFP_KERNEL);
+}
+static inline void sched_cpumask_free(struct allmasks *masks)
+{
+       kfree(masks);
+}
 #else
-#define        SCHED_CPUMASK_ALLOC             0
-#define        SCHED_CPUMASK_FREE(v)
-#define        SCHED_CPUMASK_DECLARE(v)        struct allmasks _v, *v = &_v
+#define SCHED_CPUMASK_DECLARE(v)       struct allmasks _v, *v = &_v
+static inline void sched_cpumask_alloc(struct allmasks **masks)
+{ }
+static inline void sched_cpumask_free(struct allmasks *masks)
+{ }
 #endif
 
 #define        SCHED_CPUMASK_VAR(v, a)         cpumask_t *v = (cpumask_t *) \
@@ -7415,9 +7380,8 @@ static int __build_sched_domains(const cpumask_t *cpu_map,
                return -ENOMEM;
        }
 
-#if SCHED_CPUMASK_ALLOC
        /* get space for all scratch cpumask variables */
-       allmasks = kmalloc(sizeof(*allmasks), GFP_KERNEL);
+       sched_cpumask_alloc(&allmasks);
        if (!allmasks) {
                printk(KERN_WARNING "Cannot alloc cpumask array\n");
                kfree(rd);
@@ -7426,7 +7390,7 @@ static int __build_sched_domains(const cpumask_t *cpu_map,
 #endif
                return -ENOMEM;
        }
-#endif
+
        tmpmask = (cpumask_t *)allmasks;
 
 
@@ -7680,13 +7644,13 @@ static int __build_sched_domains(const cpumask_t *cpu_map,
                cpu_attach_domain(sd, rd, i);
        }
 
-       SCHED_CPUMASK_FREE((void *)allmasks);
+       sched_cpumask_free(allmasks);
        return 0;
 
 #ifdef CONFIG_NUMA
 error:
        free_sched_groups(cpu_map, tmpmask);
-       SCHED_CPUMASK_FREE((void *)allmasks);
+       sched_cpumask_free(allmasks);
        kfree(rd);
        return -ENOMEM;
 #endif
@@ -7750,8 +7714,6 @@ static void detach_destroy_domains(const cpumask_t *cpu_map)
        cpumask_t tmpmask;
        int i;
 
-       unregister_sched_domain_sysctl();
-
        for_each_cpu_mask_nr(i, *cpu_map)
                cpu_attach_domain(NULL, &def_root_domain, i);
        synchronize_sched();
@@ -7829,7 +7791,7 @@ match1:
                ndoms_cur = 0;
                doms_new = &fallback_doms;
                cpus_andnot(doms_new[0], cpu_online_map, cpu_isolated_map);
-               dattr_new = NULL;
+               WARN_ON_ONCE(dattr_new);
        }
 
        /* Build new domains */
@@ -8489,7 +8451,7 @@ static
 int alloc_fair_sched_group(struct task_group *tg, struct task_group *parent)
 {
        struct cfs_rq *cfs_rq;
-       struct sched_entity *se, *parent_se;
+       struct sched_entity *se;
        struct rq *rq;
        int i;
 
@@ -8505,18 +8467,17 @@ int alloc_fair_sched_group(struct task_group *tg, struct task_group *parent)
        for_each_possible_cpu(i) {
                rq = cpu_rq(i);
 
-               cfs_rq = kmalloc_node(sizeof(struct cfs_rq),
-                               GFP_KERNEL|__GFP_ZERO, cpu_to_node(i));
+               cfs_rq = kzalloc_node(sizeof(struct cfs_rq),
+                                     GFP_KERNEL, cpu_to_node(i));
                if (!cfs_rq)
                        goto err;
 
-               se = kmalloc_node(sizeof(struct sched_entity),
-                               GFP_KERNEL|__GFP_ZERO, cpu_to_node(i));
+               se = kzalloc_node(sizeof(struct sched_entity),
+                                 GFP_KERNEL, cpu_to_node(i));
                if (!se)
                        goto err;
 
-               parent_se = parent ? parent->se[i] : NULL;
-               init_tg_cfs_entry(tg, cfs_rq, se, i, 0, parent_se);
+               init_tg_cfs_entry(tg, cfs_rq, se, i, 0, parent->se[i]);
        }
 
        return 1;
@@ -8577,7 +8538,7 @@ static
 int alloc_rt_sched_group(struct task_group *tg, struct task_group *parent)
 {
        struct rt_rq *rt_rq;
-       struct sched_rt_entity *rt_se, *parent_se;
+       struct sched_rt_entity *rt_se;
        struct rq *rq;
        int i;
 
@@ -8594,18 +8555,17 @@ int alloc_rt_sched_group(struct task_group *tg, struct task_group *parent)
        for_each_possible_cpu(i) {
                rq = cpu_rq(i);
 
-               rt_rq = kmalloc_node(sizeof(struct rt_rq),
-                               GFP_KERNEL|__GFP_ZERO, cpu_to_node(i));
+               rt_rq = kzalloc_node(sizeof(struct rt_rq),
+                                    GFP_KERNEL, cpu_to_node(i));
                if (!rt_rq)
                        goto err;
 
-               rt_se = kmalloc_node(sizeof(struct sched_rt_entity),
-                               GFP_KERNEL|__GFP_ZERO, cpu_to_node(i));
+               rt_se = kzalloc_node(sizeof(struct sched_rt_entity),
+                                    GFP_KERNEL, cpu_to_node(i));
                if (!rt_se)
                        goto err;
 
-               parent_se = parent ? parent->rt_se[i] : NULL;
-               init_tg_rt_entry(tg, rt_rq, rt_se, i, 0, parent_se);
+               init_tg_rt_entry(tg, rt_rq, rt_se, i, 0, parent->rt_se[i]);
        }
 
        return 1;
@@ -9248,11 +9208,12 @@ struct cgroup_subsys cpu_cgroup_subsys = {
  * (balbir@in.ibm.com).
  */
 
-/* track cpu usage of a group of tasks */
+/* track cpu usage of a group of tasks and its child groups */
 struct cpuacct {
        struct cgroup_subsys_state css;
        /* cpuusage holds pointer to a u64-type object on every cpu */
        u64 *cpuusage;
+       struct cpuacct *parent;
 };
 
 struct cgroup_subsys cpuacct_subsys;
@@ -9286,6 +9247,9 @@ static struct cgroup_subsys_state *cpuacct_create(
                return ERR_PTR(-ENOMEM);
        }
 
+       if (cgrp->parent)
+               ca->parent = cgroup_ca(cgrp->parent);
+
        return &ca->css;
 }
 
@@ -9365,14 +9329,16 @@ static int cpuacct_populate(struct cgroup_subsys *ss, struct cgroup *cgrp)
 static void cpuacct_charge(struct task_struct *tsk, u64 cputime)
 {
        struct cpuacct *ca;
+       int cpu;
 
        if (!cpuacct_subsys.active)
                return;
 
+       cpu = task_cpu(tsk);
        ca = task_ca(tsk);
-       if (ca) {
-               u64 *cpuusage = percpu_ptr(ca->cpuusage, task_cpu(tsk));
 
+       for (; ca; ca = ca->parent) {
+               u64 *cpuusage = percpu_ptr(ca->cpuusage, cpu);
                *cpuusage += cputime;
        }
 }