Pull acpi_os_free into release branch

[~andy/linux] / kernel / sched.c
diff --git a/kernel/sched.c b/kernel/sched.c

index 1847a4456a2dd9df3a0eba710e4f7f4fedddd22c..d5e37072ea54cb21af2b86925daf07677c27e920 100644 (file)
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -354,12 +354,31 @@ static inline void finish_lock_switch(runqueue_t *rq, task_t *prev)
  }
  #endif /* __ARCH_WANT_UNLOCKED_CTXSW */
  
+/*
+ * __task_rq_lock - lock the runqueue a given task resides on.
+ * Must be called with interrupts disabled.
+ */
+static inline runqueue_t *__task_rq_lock(task_t *p)
+       __acquires(rq->lock)
+{
+       struct runqueue *rq;
+
+repeat_lock_task:
+       rq = task_rq(p);
+       spin_lock(&rq->lock);
+       if (unlikely(rq != task_rq(p))) {
+               spin_unlock(&rq->lock);
+               goto repeat_lock_task;
+       }
+       return rq;
+}
+
  /*
   * task_rq_lock - lock the runqueue a given task resides on and disable
   * interrupts.  Note the ordering: we can safely lookup the task_rq without
   * explicitly disabling preemption.
   */
-static inline runqueue_t *task_rq_lock(task_t *p, unsigned long *flags)
+static runqueue_t *task_rq_lock(task_t *p, unsigned long *flags)
         __acquires(rq->lock)
  {
         struct runqueue *rq;
@@ -375,6 +394,12 @@ repeat_lock_task:
         return rq;
  }
  
+static inline void __task_rq_unlock(runqueue_t *rq)
+       __releases(rq->lock)
+{
+       spin_unlock(&rq->lock);
+}
+
  static inline void task_rq_unlock(runqueue_t *rq, unsigned long *flags)
         __releases(rq->lock)
  {
@@ -638,7 +663,7 @@ static inline void enqueue_task_head(struct task_struct *p, prio_array_t *array)
  }
  
  /*
- * effective_prio - return the priority that is based on the static
+ * __normal_prio - return the priority that is based on the static
   * priority but is modified by bonuses/penalties.
   *
   * We scale the actual sleep average [0 .... MAX_SLEEP_AVG]
@@ -651,13 +676,11 @@ static inline void enqueue_task_head(struct task_struct *p, prio_array_t *array)
   *
   * Both properties are important to certain workloads.
   */
-static int effective_prio(task_t *p)
+
+static inline int __normal_prio(task_t *p)
  {
         int bonus, prio;
  
-       if (rt_task(p))
-               return p->prio;
-
         bonus = CURRENT_BONUS(p) - MAX_BONUS / 2;
  
         prio = p->static_prio - bonus;
@@ -692,7 +715,7 @@ static int effective_prio(task_t *p)
  
  static void set_load_weight(task_t *p)
  {
-       if (rt_task(p)) {
+       if (has_rt_policy(p)) {
  #ifdef CONFIG_SMP
                 if (p == task_rq(p)->migration_thread)
                         /*
@@ -730,6 +753,44 @@ static inline void dec_nr_running(task_t *p, runqueue_t *rq)
         dec_raw_weighted_load(rq, p);
  }
  
+/*
+ * Calculate the expected normal priority: i.e. priority
+ * without taking RT-inheritance into account. Might be
+ * boosted by interactivity modifiers. Changes upon fork,
+ * setprio syscalls, and whenever the interactivity
+ * estimator recalculates.
+ */
+static inline int normal_prio(task_t *p)
+{
+       int prio;
+
+       if (has_rt_policy(p))
+               prio = MAX_RT_PRIO-1 - p->rt_priority;
+       else
+               prio = __normal_prio(p);
+       return prio;
+}
+
+/*
+ * Calculate the current priority, i.e. the priority
+ * taken into account by the scheduler. This value might
+ * be boosted by RT tasks, or might be boosted by
+ * interactivity modifiers. Will be RT if the task got
+ * RT-boosted. If not then it returns p->normal_prio.
+ */
+static int effective_prio(task_t *p)
+{
+       p->normal_prio = normal_prio(p);
+       /*
+        * If we are an RT task or we were boosted to RT priority,
+        * keep the priority unchanged. Otherwise, update priority
+        * to the normal priority:
+        */
+       if (!rt_prio(p->prio))
+               return p->normal_prio;
+       return p->prio;
+}
+
  /*
   * __activate_task - move a task to the runqueue.
   */
@@ -752,6 +813,10 @@ static inline void __activate_idle_task(task_t *p, runqueue_t *rq)
         inc_nr_running(p, rq);
  }
  
+/*
+ * Recalculate p->normal_prio and p->prio after having slept,
+ * updating the sleep-average too:
+ */
  static int recalc_task_prio(task_t *p, unsigned long long now)
  {
         /* Caller must always ensure 'now >= p->timestamp' */
@@ -1162,6 +1227,11 @@ static int sched_balance_self(int cpu, int flag)
         struct sched_domain *tmp, *sd = NULL;
  
         for_each_domain(cpu, tmp) {
+               /*
+                * If power savings logic is enabled for a domain, stop there.
+                */
+               if (tmp->flags & SD_POWERSAVINGS_BALANCE)
+                       break;
                 if (tmp->flags & flag)
                         sd = tmp;
         }
@@ -1443,6 +1513,12 @@ void fastcall sched_fork(task_t *p, int clone_flags)
          * event cannot wake it up and insert it on the runqueue either.
          */
         p->state = TASK_RUNNING;
+
+       /*
+        * Make sure we do not leak PI boosting priority to the child:
+        */
+       p->prio = current->normal_prio;
+
         INIT_LIST_HEAD(&p->run_list);
         p->array = NULL;
  #ifdef CONFIG_SCHEDSTATS
@@ -1522,6 +1598,7 @@ void fastcall wake_up_new_task(task_t *p, unsigned long clone_flags)
                                 __activate_task(p, rq);
                         else {
                                 p->prio = current->prio;
+                               p->normal_prio = current->normal_prio;
                                 list_add_tail(&p->run_list, &current->run_list);
                                 p->array = current->array;
                                 p->array->nr_active++;
@@ -1941,6 +2018,7 @@ int can_migrate_task(task_t *p, runqueue_t *rq, int this_cpu,
         return 1;
  }
  
+#define rq_best_prio(rq) min((rq)->curr->prio, (rq)->best_expired_prio)
  /*
   * move_tasks tries to move up to max_nr_move tasks and max_load_move weighted
   * load from busiest to this_rq, as part of a balancing operation within
@@ -1955,7 +2033,9 @@ static int move_tasks(runqueue_t *this_rq, int this_cpu, runqueue_t *busiest,
  {
         prio_array_t *array, *dst_array;
         struct list_head *head, *curr;
-       int idx, pulled = 0, pinned = 0;
+       int idx, pulled = 0, pinned = 0, this_best_prio, busiest_best_prio;
+       int busiest_best_prio_seen;
+       int skip_for_load; /* skip the task based on weighted load issues */
         long rem_load_move;
         task_t *tmp;
  
@@ -1964,6 +2044,16 @@ static int move_tasks(runqueue_t *this_rq, int this_cpu, runqueue_t *busiest,
  
         rem_load_move = max_load_move;
         pinned = 1;
+       this_best_prio = rq_best_prio(this_rq);
+       busiest_best_prio = rq_best_prio(busiest);
+       /*
+        * Enable handling of the case where there is more than one task
+        * with the best priority.   If the current running task is one
+        * of those with prio==busiest_best_prio we know it won't be moved
+        * and therefore it's safe to override the skip (based on load) of
+        * any task we find with that prio.
+        */
+       busiest_best_prio_seen = busiest_best_prio == busiest->curr->prio;
  
         /*
          * We first consider expired tasks. Those will likely not be
@@ -2003,8 +2093,17 @@ skip_queue:
  
         curr = curr->prev;
  
-       if (tmp->load_weight > rem_load_move ||
+       /*
+        * To help distribute high priority tasks across CPUs we don't
+        * skip a task if it will be the highest priority task (i.e. smallest
+        * prio value) on its new queue regardless of its load weight
+        */
+       skip_for_load = tmp->load_weight > rem_load_move;
+       if (skip_for_load && idx < this_best_prio)
+               skip_for_load = !busiest_best_prio_seen && idx == busiest_best_prio;
+       if (skip_for_load ||
             !can_migrate_task(tmp, busiest, this_cpu, sd, idle, &pinned)) {
+               busiest_best_prio_seen |= idx == busiest_best_prio;
                 if (curr != head)
                         goto skip_queue;
                 idx++;
@@ -2025,6 +2124,8 @@ skip_queue:
          * and the prescribed amount of weighted load.
          */
         if (pulled < max_nr_move && rem_load_move > 0) {
+               if (idx < this_best_prio)
+                       this_best_prio = idx;
                 if (curr != head)
                         goto skip_queue;
                 idx++;
@@ -2058,6 +2159,12 @@ find_busiest_group(struct sched_domain *sd, int this_cpu,
         unsigned long busiest_load_per_task, busiest_nr_running;
         unsigned long this_load_per_task, this_nr_running;
         int load_idx;
+#if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT)
+       int power_savings_balance = 1;
+       unsigned long leader_nr_running = 0, min_load_per_task = 0;
+       unsigned long min_nr_running = ULONG_MAX;
+       struct sched_group *group_min = NULL, *group_leader = NULL;
+#endif
  
         max_load = this_load = total_load = total_pwr = 0;
         busiest_load_per_task = busiest_nr_running = 0;
@@ -2070,7 +2177,7 @@ find_busiest_group(struct sched_domain *sd, int this_cpu,
                 load_idx = sd->idle_idx;
  
         do {
-               unsigned long load;
+               unsigned long load, group_capacity;
                 int local_group;
                 int i;
                 unsigned long sum_nr_running, sum_weighted_load;
@@ -2103,18 +2210,76 @@ find_busiest_group(struct sched_domain *sd, int this_cpu,
                 /* Adjust by relative CPU power of the group */
                 avg_load = (avg_load * SCHED_LOAD_SCALE) / group->cpu_power;
  
+               group_capacity = group->cpu_power / SCHED_LOAD_SCALE;
+
                 if (local_group) {
                         this_load = avg_load;
                         this = group;
                         this_nr_running = sum_nr_running;
                         this_load_per_task = sum_weighted_load;
                 } else if (avg_load > max_load &&
-                          sum_nr_running > group->cpu_power / SCHED_LOAD_SCALE) {
+                          sum_nr_running > group_capacity) {
                         max_load = avg_load;
                         busiest = group;
                         busiest_nr_running = sum_nr_running;
                         busiest_load_per_task = sum_weighted_load;
                 }
+
+#if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT)
+               /*
+                * Busy processors will not participate in power savings
+                * balance.
+                */
+               if (idle == NOT_IDLE || !(sd->flags & SD_POWERSAVINGS_BALANCE))
+                       goto group_next;
+
+               /*
+                * If the local group is idle or completely loaded
+                * no need to do power savings balance at this domain
+                */
+               if (local_group && (this_nr_running >= group_capacity ||
+                                   !this_nr_running))
+                       power_savings_balance = 0;
+
+               /*
+                * If a group is already running at full capacity or idle,
+                * don't include that group in power savings calculations
+                */
+               if (!power_savings_balance || sum_nr_running >= group_capacity
+                   || !sum_nr_running)
+                       goto group_next;
+
+               /*
+                * Calculate the group which has the least non-idle load.
+                * This is the group from where we need to pick up the load
+                * for saving power
+                */
+               if ((sum_nr_running < min_nr_running) ||
+                   (sum_nr_running == min_nr_running &&
+                    first_cpu(group->cpumask) <
+                    first_cpu(group_min->cpumask))) {
+                       group_min = group;
+                       min_nr_running = sum_nr_running;
+                       min_load_per_task = sum_weighted_load /
+                                               sum_nr_running;
+               }
+
+               /*
+                * Calculate the group which is almost near its
+                * capacity but still has some space to pick up some load
+                * from another group and save more power
+                */
+               if (sum_nr_running <= group_capacity - 1)
+                       if (sum_nr_running > leader_nr_running ||
+                           (sum_nr_running == leader_nr_running &&
+                            first_cpu(group->cpumask) >
+                             first_cpu(group_leader->cpumask))) {
+                               group_leader = group;
+                               leader_nr_running = sum_nr_running;
+                       }
+
+group_next:
+#endif
                 group = group->next;
         } while (group != sd->groups);
  
@@ -2223,7 +2388,16 @@ small_imbalance:
         return busiest;
  
  out_balanced:
+#if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT)
+       if (idle == NOT_IDLE || !(sd->flags & SD_POWERSAVINGS_BALANCE))
+               goto ret;
  
+       if (this == group_leader && group_leader != group_min) {
+               *imbalance = min_load_per_task;
+               return group_min;
+       }
+ret:
+#endif
         *imbalance = 0;
         return NULL;
  }
@@ -2276,7 +2450,8 @@ static int load_balance(int this_cpu, runqueue_t *this_rq,
         int active_balance = 0;
         int sd_idle = 0;
  
-       if (idle != NOT_IDLE && sd->flags & SD_SHARE_CPUPOWER)
+       if (idle != NOT_IDLE && sd->flags & SD_SHARE_CPUPOWER &&
+           !sched_smt_power_savings)
                 sd_idle = 1;
  
         schedstat_inc(sd, lb_cnt[idle]);
@@ -2365,7 +2540,8 @@ static int load_balance(int this_cpu, runqueue_t *this_rq,
                         sd->balance_interval *= 2;
         }
  
-       if (!nr_moved && !sd_idle && sd->flags & SD_SHARE_CPUPOWER)
+       if (!nr_moved && !sd_idle && sd->flags & SD_SHARE_CPUPOWER &&
+           !sched_smt_power_savings)
                 return -1;
         return nr_moved;
  
@@ -2380,7 +2556,7 @@ out_one_pinned:
                         (sd->balance_interval < sd->max_interval))
                 sd->balance_interval *= 2;
  
-       if (!sd_idle && sd->flags & SD_SHARE_CPUPOWER)
+       if (!sd_idle && sd->flags & SD_SHARE_CPUPOWER && !sched_smt_power_savings)
                 return -1;
         return 0;
  }
@@ -2401,7 +2577,7 @@ static int load_balance_newidle(int this_cpu, runqueue_t *this_rq,
         int nr_moved = 0;
         int sd_idle = 0;
  
-       if (sd->flags & SD_SHARE_CPUPOWER)
+       if (sd->flags & SD_SHARE_CPUPOWER && !sched_smt_power_savings)
                 sd_idle = 1;
  
         schedstat_inc(sd, lb_cnt[NEWLY_IDLE]);
@@ -2442,7 +2618,7 @@ static int load_balance_newidle(int this_cpu, runqueue_t *this_rq,
  
  out_balanced:
         schedstat_inc(sd, lb_balanced[NEWLY_IDLE]);
-       if (!sd_idle && sd->flags & SD_SHARE_CPUPOWER)
+       if (!sd_idle && sd->flags & SD_SHARE_CPUPOWER && !sched_smt_power_savings)
                 return -1;
         sd->nr_balance_failed = 0;
         return 0;
@@ -3564,12 +3740,65 @@ long fastcall __sched sleep_on_timeout(wait_queue_head_t *q, long timeout)
  
  EXPORT_SYMBOL(sleep_on_timeout);
  
+#ifdef CONFIG_RT_MUTEXES
+
+/*
+ * rt_mutex_setprio - set the current priority of a task
+ * @p: task
+ * @prio: prio value (kernel-internal form)
+ *
+ * This function changes the 'effective' priority of a task. It does
+ * not touch ->normal_prio like __setscheduler().
+ *
+ * Used by the rt_mutex code to implement priority inheritance logic.
+ */
+void rt_mutex_setprio(task_t *p, int prio)
+{
+       unsigned long flags;
+       prio_array_t *array;
+       runqueue_t *rq;
+       int oldprio;
+
+       BUG_ON(prio < 0 || prio > MAX_PRIO);
+
+       rq = task_rq_lock(p, &flags);
+
+       oldprio = p->prio;
+       array = p->array;
+       if (array)
+               dequeue_task(p, array);
+       p->prio = prio;
+
+       if (array) {
+               /*
+                * If changing to an RT priority then queue it
+                * in the active array!
+                */
+               if (rt_task(p))
+                       array = rq->active;
+               enqueue_task(p, array);
+               /*
+                * Reschedule if we are currently running on this runqueue and
+                * our priority decreased, or if we are not currently running on
+                * this runqueue and our priority is higher than the current's
+                */
+               if (task_running(rq, p)) {
+                       if (p->prio > oldprio)
+                               resched_task(rq->curr);
+               } else if (TASK_PREEMPTS_CURR(p, rq))
+                       resched_task(rq->curr);
+       }
+       task_rq_unlock(rq, &flags);
+}
+
+#endif
+
  void set_user_nice(task_t *p, long nice)
  {
         unsigned long flags;
         prio_array_t *array;
         runqueue_t *rq;
-       int old_prio, new_prio, delta;
+       int old_prio, delta;
  
         if (TASK_NICE(p) == nice || nice < -20 || nice > 19)
                 return;
@@ -3584,7 +3813,7 @@ void set_user_nice(task_t *p, long nice)
          * it wont have any effect on scheduling until the task is
          * not SCHED_NORMAL/SCHED_BATCH:
          */
-       if (rt_task(p)) {
+       if (has_rt_policy(p)) {
                 p->static_prio = NICE_TO_PRIO(nice);
                 goto out_unlock;
         }
@@ -3594,12 +3823,11 @@ void set_user_nice(task_t *p, long nice)
                 dec_raw_weighted_load(rq, p);
         }
  
-       old_prio = p->prio;
-       new_prio = NICE_TO_PRIO(nice);
-       delta = new_prio - old_prio;
         p->static_prio = NICE_TO_PRIO(nice);
         set_load_weight(p);
-       p->prio += delta;
+       old_prio = p->prio;
+       p->prio = effective_prio(p);
+       delta = p->prio - old_prio;
  
         if (array) {
                 enqueue_task(p, array);
@@ -3614,7 +3842,6 @@ void set_user_nice(task_t *p, long nice)
  out_unlock:
         task_rq_unlock(rq, &flags);
  }
-
  EXPORT_SYMBOL(set_user_nice);
  
  /*
@@ -3729,16 +3956,14 @@ static void __setscheduler(struct task_struct *p, int policy, int prio)
         BUG_ON(p->array);
         p->policy = policy;
         p->rt_priority = prio;
-       if (policy != SCHED_NORMAL && policy != SCHED_BATCH) {
-               p->prio = MAX_RT_PRIO-1 - p->rt_priority;
-       } else {
-               p->prio = p->static_prio;
-               /*
-                * SCHED_BATCH tasks are treated as perpetual CPU hogs:
-                */
-               if (policy == SCHED_BATCH)
-                       p->sleep_avg = 0;
-       }
+       p->normal_prio = normal_prio(p);
+       /* we are holding p->pi_lock already */
+       p->prio = rt_mutex_getprio(p);
+       /*
+        * SCHED_BATCH tasks are treated as perpetual CPU hogs:
+        */
+       if (policy == SCHED_BATCH)
+               p->sleep_avg = 0;
         set_load_weight(p);
  }
  
@@ -3758,6 +3983,8 @@ int sched_setscheduler(struct task_struct *p, int policy,
         unsigned long flags;
         runqueue_t *rq;
  
+       /* may grab non-irq protected spin_locks */
+       BUG_ON(in_interrupt());
  recheck:
         /* double check policy once rq lock held */
         if (policy < 0)
@@ -3805,15 +4032,21 @@ recheck:
         retval = security_task_setscheduler(p, policy, param);
         if (retval)
                 return retval;
+       /*
+        * make sure no PI-waiters arrive (or leave) while we are
+        * changing the priority of the task:
+        */
+       spin_lock_irqsave(&p->pi_lock, flags);
         /*
          * To be able to change p->policy safely, the apropriate
          * runqueue lock must be held.
          */
-       rq = task_rq_lock(p, &flags);
+       rq = __task_rq_lock(p);
         /* recheck policy now with rq lock held */
         if (unlikely(oldpolicy != -1 && oldpolicy != p->policy)) {
                 policy = oldpolicy = -1;
-               task_rq_unlock(rq, &flags);
+               __task_rq_unlock(rq);
+               spin_unlock_irqrestore(&p->pi_lock, flags);
                 goto recheck;
         }
         array = p->array;
@@ -3834,7 +4067,11 @@ recheck:
                 } else if (TASK_PREEMPTS_CURR(p, rq))
                         resched_task(rq->curr);
         }
-       task_rq_unlock(rq, &flags);
+       __task_rq_unlock(rq);
+       spin_unlock_irqrestore(&p->pi_lock, flags);
+
+       rt_mutex_adjust_pi(p);
+
         return 0;
  }
  EXPORT_SYMBOL_GPL(sched_setscheduler);
@@ -3856,8 +4093,10 @@ do_sched_setscheduler(pid_t pid, int policy, struct sched_param __user *param)
                 read_unlock_irq(&tasklist_lock);
                 return -ESRCH;
         }
-       retval = sched_setscheduler(p, policy, &lparam);
+       get_task_struct(p);
         read_unlock_irq(&tasklist_lock);
+       retval = sched_setscheduler(p, policy, &lparam);
+       put_task_struct(p);
         return retval;
  }
  
@@ -4147,7 +4386,16 @@ asmlinkage long sys_sched_yield(void)
         return 0;
  }
  
-static inline void __cond_resched(void)
+static inline int __resched_legal(void)
+{
+       if (unlikely(preempt_count()))
+               return 0;
+       if (unlikely(system_state != SYSTEM_RUNNING))
+               return 0;
+       return 1;
+}
+
+static void __cond_resched(void)
  {
  #ifdef CONFIG_DEBUG_SPINLOCK_SLEEP
         __might_sleep(__FILE__, __LINE__);
@@ -4157,10 +4405,6 @@ static inline void __cond_resched(void)
          * PREEMPT_ACTIVE, which could trigger a second
          * cond_resched() call.
          */
-       if (unlikely(preempt_count()))
-               return;
-       if (unlikely(system_state != SYSTEM_RUNNING))
-               return;
         do {
                 add_preempt_count(PREEMPT_ACTIVE);
                 schedule();
@@ -4170,13 +4414,12 @@ static inline void __cond_resched(void)
  
  int __sched cond_resched(void)
  {
-       if (need_resched()) {
+       if (need_resched() && __resched_legal()) {
                 __cond_resched();
                 return 1;
         }
         return 0;
  }
-
  EXPORT_SYMBOL(cond_resched);
  
  /*
@@ -4197,7 +4440,7 @@ int cond_resched_lock(spinlock_t *lock)
                 ret = 1;
                 spin_lock(lock);
         }
-       if (need_resched()) {
+       if (need_resched() && __resched_legal()) {
                 _raw_spin_unlock(lock);
                 preempt_enable_no_resched();
                 __cond_resched();
@@ -4206,14 +4449,13 @@ int cond_resched_lock(spinlock_t *lock)
         }
         return ret;
  }
-
  EXPORT_SYMBOL(cond_resched_lock);
  
  int __sched cond_resched_softirq(void)
  {
         BUG_ON(!in_softirq());
  
-       if (need_resched()) {
+       if (need_resched() && __resched_legal()) {
                 __local_bh_enable();
                 __cond_resched();
                 local_bh_disable();
@@ -4221,10 +4463,8 @@ int __sched cond_resched_softirq(void)
         }
         return 0;
  }
-
  EXPORT_SYMBOL(cond_resched_softirq);
  
-
  /**
   * yield - yield the current processor to other threads.
   *
@@ -4469,7 +4709,7 @@ void __devinit init_idle(task_t *idle, int cpu)
         idle->timestamp = sched_clock();
         idle->sleep_avg = 0;
         idle->array = NULL;
-       idle->prio = MAX_PRIO;
+       idle->prio = idle->normal_prio = MAX_PRIO;
         idle->state = TASK_RUNNING;
         idle->cpus_allowed = cpumask_of_cpu(cpu);
         set_task_cpu(idle, cpu);
@@ -5708,6 +5948,7 @@ static cpumask_t sched_domain_node_span(int node)
  }
  #endif
  
+int sched_smt_power_savings = 0, sched_mc_power_savings = 0;
  /*
   * At the moment, CONFIG_SCHED_SMT is never defined, but leave it in so we
   * can switch it on easily if needed.
@@ -5723,7 +5964,7 @@ static int cpu_to_cpu_group(int cpu)
  
  #ifdef CONFIG_SCHED_MC
  static DEFINE_PER_CPU(struct sched_domain, core_domains);
-static struct sched_group sched_group_core[NR_CPUS];
+static struct sched_group *sched_group_core_bycpu[NR_CPUS];
  #endif
  
  #if defined(CONFIG_SCHED_MC) && defined(CONFIG_SCHED_SMT)
@@ -5739,7 +5980,7 @@ static int cpu_to_core_group(int cpu)
  #endif
  
  static DEFINE_PER_CPU(struct sched_domain, phys_domains);
-static struct sched_group sched_group_phys[NR_CPUS];
+static struct sched_group *sched_group_phys_bycpu[NR_CPUS];
  static int cpu_to_phys_group(int cpu)
  {
  #if defined(CONFIG_SCHED_MC)
@@ -5796,13 +6037,74 @@ next_sg:
  }
  #endif
  
+/* Free memory allocated for various sched_group structures */
+static void free_sched_groups(const cpumask_t *cpu_map)
+{
+       int cpu;
+#ifdef CONFIG_NUMA
+       int i;
+
+       for_each_cpu_mask(cpu, *cpu_map) {
+               struct sched_group *sched_group_allnodes
+                       = sched_group_allnodes_bycpu[cpu];
+               struct sched_group **sched_group_nodes
+                       = sched_group_nodes_bycpu[cpu];
+
+               if (sched_group_allnodes) {
+                       kfree(sched_group_allnodes);
+                       sched_group_allnodes_bycpu[cpu] = NULL;
+               }
+
+               if (!sched_group_nodes)
+                       continue;
+
+               for (i = 0; i < MAX_NUMNODES; i++) {
+                       cpumask_t nodemask = node_to_cpumask(i);
+                       struct sched_group *oldsg, *sg = sched_group_nodes[i];
+
+                       cpus_and(nodemask, nodemask, *cpu_map);
+                       if (cpus_empty(nodemask))
+                               continue;
+
+                       if (sg == NULL)
+                               continue;
+                       sg = sg->next;
+next_sg:
+                       oldsg = sg;
+                       sg = sg->next;
+                       kfree(oldsg);
+                       if (oldsg != sched_group_nodes[i])
+                               goto next_sg;
+               }
+               kfree(sched_group_nodes);
+               sched_group_nodes_bycpu[cpu] = NULL;
+       }
+#endif
+       for_each_cpu_mask(cpu, *cpu_map) {
+               if (sched_group_phys_bycpu[cpu]) {
+                       kfree(sched_group_phys_bycpu[cpu]);
+                       sched_group_phys_bycpu[cpu] = NULL;
+               }
+#ifdef CONFIG_SCHED_MC
+               if (sched_group_core_bycpu[cpu]) {
+                       kfree(sched_group_core_bycpu[cpu]);
+                       sched_group_core_bycpu[cpu] = NULL;
+               }
+#endif
+       }
+}
+
  /*
   * Build sched domains for a given set of cpus and attach the sched domains
   * to the individual cpus
   */
-void build_sched_domains(const cpumask_t *cpu_map)
+static int build_sched_domains(const cpumask_t *cpu_map)
  {
         int i;
+       struct sched_group *sched_group_phys = NULL;
+#ifdef CONFIG_SCHED_MC
+       struct sched_group *sched_group_core = NULL;
+#endif
  #ifdef CONFIG_NUMA
         struct sched_group **sched_group_nodes = NULL;
         struct sched_group *sched_group_allnodes = NULL;
@@ -5810,11 +6112,11 @@ void build_sched_domains(const cpumask_t *cpu_map)
         /*
          * Allocate the per-node list of sched groups
          */
-       sched_group_nodes = kmalloc(sizeof(struct sched_group*)*MAX_NUMNODES,
-                                          GFP_ATOMIC);
+       sched_group_nodes = kzalloc(sizeof(struct sched_group*)*MAX_NUMNODES,
+                                          GFP_KERNEL);
         if (!sched_group_nodes) {
                 printk(KERN_WARNING "Can not alloc sched group node list\n");
-               return;
+               return -ENOMEM;
         }
         sched_group_nodes_bycpu[first_cpu(*cpu_map)] = sched_group_nodes;
  #endif
@@ -5840,7 +6142,7 @@ void build_sched_domains(const cpumask_t *cpu_map)
                                 if (!sched_group_allnodes) {
                                         printk(KERN_WARNING
                                         "Can not alloc allnodes sched group\n");
-                                       break;
+                                       goto error;
                                 }
                                 sched_group_allnodes_bycpu[i]
                                                 = sched_group_allnodes;
@@ -5861,6 +6163,18 @@ void build_sched_domains(const cpumask_t *cpu_map)
                 cpus_and(sd->span, sd->span, *cpu_map);
  #endif
  
+               if (!sched_group_phys) {
+                       sched_group_phys
+                               = kmalloc(sizeof(struct sched_group) * NR_CPUS,
+                                         GFP_KERNEL);
+                       if (!sched_group_phys) {
+                               printk (KERN_WARNING "Can not alloc phys sched"
+                                                    "group\n");
+                               goto error;
+                       }
+                       sched_group_phys_bycpu[i] = sched_group_phys;
+               }
+
                 p = sd;
                 sd = &per_cpu(phys_domains, i);
                 group = cpu_to_phys_group(i);
@@ -5870,6 +6184,18 @@ void build_sched_domains(const cpumask_t *cpu_map)
                 sd->groups = &sched_group_phys[group];
  
  #ifdef CONFIG_SCHED_MC
+               if (!sched_group_core) {
+                       sched_group_core
+                               = kmalloc(sizeof(struct sched_group) * NR_CPUS,
+                                         GFP_KERNEL);
+                       if (!sched_group_core) {
+                               printk (KERN_WARNING "Can not alloc core sched"
+                                                    "group\n");
+                               goto error;
+                       }
+                       sched_group_core_bycpu[i] = sched_group_core;
+               }
+
                 p = sd;
                 sd = &per_cpu(core_domains, i);
                 group = cpu_to_core_group(i);
@@ -5953,24 +6279,21 @@ void build_sched_domains(const cpumask_t *cpu_map)
                 domainspan = sched_domain_node_span(i);
                 cpus_and(domainspan, domainspan, *cpu_map);
  
-               sg = kmalloc(sizeof(struct sched_group), GFP_KERNEL);
+               sg = kmalloc_node(sizeof(struct sched_group), GFP_KERNEL, i);
+               if (!sg) {
+                       printk(KERN_WARNING "Can not alloc domain group for "
+                               "node %d\n", i);
+                       goto error;
+               }
                 sched_group_nodes[i] = sg;
                 for_each_cpu_mask(j, nodemask) {
                         struct sched_domain *sd;
                         sd = &per_cpu(node_domains, j);
                         sd->groups = sg;
-                       if (sd->groups == NULL) {
-                               /* Turn off balancing if we have no groups */
-                               sd->flags = 0;
-                       }
-               }
-               if (!sg) {
-                       printk(KERN_WARNING
-                       "Can not alloc domain group for node %d\n", i);
-                       continue;
                 }
                 sg->cpu_power = 0;
                 sg->cpumask = nodemask;
+               sg->next = sg;
                 cpus_or(covered, covered, nodemask);
                 prev = sg;
  
@@ -5989,54 +6312,90 @@ void build_sched_domains(const cpumask_t *cpu_map)
                         if (cpus_empty(tmp))
                                 continue;
  
-                       sg = kmalloc(sizeof(struct sched_group), GFP_KERNEL);
+                       sg = kmalloc_node(sizeof(struct sched_group),
+                                         GFP_KERNEL, i);
                         if (!sg) {
                                 printk(KERN_WARNING
                                 "Can not alloc domain group for node %d\n", j);
-                               break;
+                               goto error;
                         }
                         sg->cpu_power = 0;
                         sg->cpumask = tmp;
+                       sg->next = prev->next;
                         cpus_or(covered, covered, tmp);
                         prev->next = sg;
                         prev = sg;
                 }
-               prev->next = sched_group_nodes[i];
         }
  #endif
  
         /* Calculate CPU power for physical packages and nodes */
+#ifdef CONFIG_SCHED_SMT
         for_each_cpu_mask(i, *cpu_map) {
-               int power;
                 struct sched_domain *sd;
-#ifdef CONFIG_SCHED_SMT
                 sd = &per_cpu(cpu_domains, i);
-               power = SCHED_LOAD_SCALE;
-               sd->groups->cpu_power = power;
+               sd->groups->cpu_power = SCHED_LOAD_SCALE;
+       }
  #endif
  #ifdef CONFIG_SCHED_MC
+       for_each_cpu_mask(i, *cpu_map) {
+               int power;
+               struct sched_domain *sd;
                 sd = &per_cpu(core_domains, i);
-               power = SCHED_LOAD_SCALE + (cpus_weight(sd->groups->cpumask)-1)
+               if (sched_smt_power_savings)
+                       power = SCHED_LOAD_SCALE * cpus_weight(sd->groups->cpumask);
+               else
+                       power = SCHED_LOAD_SCALE + (cpus_weight(sd->groups->cpumask)-1)
                                             * SCHED_LOAD_SCALE / 10;
                 sd->groups->cpu_power = power;
+       }
+#endif
  
+       for_each_cpu_mask(i, *cpu_map) {
+               struct sched_domain *sd;
+#ifdef CONFIG_SCHED_MC
                 sd = &per_cpu(phys_domains, i);
+               if (i != first_cpu(sd->groups->cpumask))
+                       continue;
  
-               /*
-                * This has to be < 2 * SCHED_LOAD_SCALE
-                * Lets keep it SCHED_LOAD_SCALE, so that
-                * while calculating NUMA group's cpu_power
-                * we can simply do
-                *  numa_group->cpu_power += phys_group->cpu_power;
-                *
-                * See "only add power once for each physical pkg"
-                * comment below
-                */
-               sd->groups->cpu_power = SCHED_LOAD_SCALE;
+               sd->groups->cpu_power = 0;
+               if (sched_mc_power_savings || sched_smt_power_savings) {
+                       int j;
+
+                       for_each_cpu_mask(j, sd->groups->cpumask) {
+                               struct sched_domain *sd1;
+                               sd1 = &per_cpu(core_domains, j);
+                               /*
+                                * Each core contributes only once to the
+                                * power of the group in the physical domain.
+                                */
+                               if (j != first_cpu(sd1->groups->cpumask))
+                                       continue;
+
+                               if (sched_smt_power_savings)
+                                       sd->groups->cpu_power += sd1->groups->cpu_power;
+                               else
+                                       sd->groups->cpu_power += SCHED_LOAD_SCALE;
+                       }
+               } else
+                       /*
+                        * This has to be < 2 * SCHED_LOAD_SCALE
+                        * Let's keep it SCHED_LOAD_SCALE, so that
+                        * while calculating NUMA group's cpu_power
+                        * we can simply do
+                        *  numa_group->cpu_power += phys_group->cpu_power;
+                        *
+                        * See "only add power once for each physical pkg"
+                        * comment below
+                        */
+                       sd->groups->cpu_power = SCHED_LOAD_SCALE;
  #else
+               int power;
                 sd = &per_cpu(phys_domains, i);
-               power = SCHED_LOAD_SCALE + SCHED_LOAD_SCALE *
-                               (cpus_weight(sd->groups->cpumask)-1) / 10;
+               if (sched_smt_power_savings)
+                       power = SCHED_LOAD_SCALE * cpus_weight(sd->groups->cpumask);
+               else
+                       power = SCHED_LOAD_SCALE;
                 sd->groups->cpu_power = power;
  #endif
         }
@@ -6064,13 +6423,20 @@ void build_sched_domains(const cpumask_t *cpu_map)
          * Tune cache-hot values:
          */
         calibrate_migration_costs(cpu_map);
+
+       return 0;
+
+error:
+       free_sched_groups(cpu_map);
+       return -ENOMEM;
  }
  /*
   * Set up scheduler domains and groups.  Callers must hold the hotplug lock.
   */
-static void arch_init_sched_domains(const cpumask_t *cpu_map)
+static int arch_init_sched_domains(const cpumask_t *cpu_map)
  {
         cpumask_t cpu_default_map;
+       int err;
  
         /*
          * Setup mask for cpus without special case scheduling requirements.
@@ -6079,51 +6445,14 @@ static void arch_init_sched_domains(const cpumask_t *cpu_map)
          */
         cpus_andnot(cpu_default_map, *cpu_map, cpu_isolated_map);
  
-       build_sched_domains(&cpu_default_map);
+       err = build_sched_domains(&cpu_default_map);
+
+       return err;
  }
  
  static void arch_destroy_sched_domains(const cpumask_t *cpu_map)
  {
-#ifdef CONFIG_NUMA
-       int i;
-       int cpu;
-
-       for_each_cpu_mask(cpu, *cpu_map) {
-               struct sched_group *sched_group_allnodes
-                       = sched_group_allnodes_bycpu[cpu];
-               struct sched_group **sched_group_nodes
-                       = sched_group_nodes_bycpu[cpu];
-
-               if (sched_group_allnodes) {
-                       kfree(sched_group_allnodes);
-                       sched_group_allnodes_bycpu[cpu] = NULL;
-               }
-
-               if (!sched_group_nodes)
-                       continue;
-
-               for (i = 0; i < MAX_NUMNODES; i++) {
-                       cpumask_t nodemask = node_to_cpumask(i);
-                       struct sched_group *oldsg, *sg = sched_group_nodes[i];
-
-                       cpus_and(nodemask, nodemask, *cpu_map);
-                       if (cpus_empty(nodemask))
-                               continue;
-
-                       if (sg == NULL)
-                               continue;
-                       sg = sg->next;
-next_sg:
-                       oldsg = sg;
-                       sg = sg->next;
-                       kfree(oldsg);
-                       if (oldsg != sched_group_nodes[i])
-                               goto next_sg;
-               }
-               kfree(sched_group_nodes);
-               sched_group_nodes_bycpu[cpu] = NULL;
-       }
-#endif
+       free_sched_groups(cpu_map);
  }
  
  /*
@@ -6148,9 +6477,10 @@ static void detach_destroy_domains(const cpumask_t *cpu_map)
   * correct sched domains
   * Call with hotplug lock held
   */
-void partition_sched_domains(cpumask_t *partition1, cpumask_t *partition2)
+int partition_sched_domains(cpumask_t *partition1, cpumask_t *partition2)
  {
         cpumask_t change_map;
+       int err = 0;
  
         cpus_and(*partition1, *partition1, cpu_online_map);
         cpus_and(*partition2, *partition2, cpu_online_map);
@@ -6159,10 +6489,86 @@ void partition_sched_domains(cpumask_t *partition1, cpumask_t *partition2)
         /* Detach sched domains from all of the affected cpus */
         detach_destroy_domains(&change_map);
         if (!cpus_empty(*partition1))
-               build_sched_domains(partition1);
-       if (!cpus_empty(*partition2))
-               build_sched_domains(partition2);
+               err = build_sched_domains(partition1);
+       if (!err && !cpus_empty(*partition2))
+               err = build_sched_domains(partition2);
+
+       return err;
+}
+
+#if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT)
+int arch_reinit_sched_domains(void)    /* detach, destroy and rebuild the online CPUs' sched domains */
+{
+       int err;
+
+       lock_cpu_hotplug();     /* held across both the teardown and the rebuild */
+       detach_destroy_domains(&cpu_online_map);
+       err = arch_init_sched_domains(&cpu_online_map); /* passes back the build_sched_domains() result */
+       unlock_cpu_hotplug();
+
+       return err;     /* whatever arch_init_sched_domains() reported */
+}
+
+static ssize_t sched_power_savings_store(const char *buf, size_t count, int smt)
+{      /* common store path: smt != 0 updates the SMT flag, smt == 0 the MC flag */
+       int ret;
+
+       if (buf[0] != '0' && buf[0] != '1')     /* only the first character is examined */
+               return -EINVAL;
+
+       if (smt)
+               sched_smt_power_savings = (buf[0] == '1');
+       else
+               sched_mc_power_savings = (buf[0] == '1');
+
+       ret = arch_reinit_sched_domains();      /* rebuild the domains now that the flag has been written */
+
+       return ret ? ret : count;       /* rebuild error if any (the flag is not restored), otherwise count */
+}
+
+int sched_create_sysfs_power_savings_entries(struct sysdev_class *cls)
+{      /* create the power-savings sysfs files under cls; returns 0 or the first failing sysfs_create_file() result */
+       int err = 0;
+#ifdef CONFIG_SCHED_SMT
+       if (smt_capable())      /* SMT file is created only when smt_capable() is true */
+               err = sysfs_create_file(&cls->kset.kobj,
+                                       &attr_sched_smt_power_savings.attr);
+#endif /* CONFIG_SCHED_SMT */
+#ifdef CONFIG_SCHED_MC
+       if (!err && mc_capable())       /* not attempted once an earlier create has failed */
+               err = sysfs_create_file(&cls->kset.kobj,
+                                       &attr_sched_mc_power_savings.attr);
+#endif /* CONFIG_SCHED_MC */
+       return err;     /* NOTE: a file created earlier is not removed on a later failure */
+}
+#endif
+
+#ifdef CONFIG_SCHED_MC
+static ssize_t sched_mc_power_savings_show(struct sys_device *dev, char *page)
+{      /* show handler for sched_mc_power_savings; dev is not used */
+       return sprintf(page, "%u\n", sched_mc_power_savings);   /* flag value plus newline; returns the sprintf() result */
+}
+static ssize_t sched_mc_power_savings_store(struct sys_device *dev, const char *buf, size_t count)
+{      /* store handler for the MC flag; dev is not used */
+       return sched_power_savings_store(buf, count, 0);        /* smt == 0 selects sched_mc_power_savings */
+}
+SYSDEV_ATTR(sched_mc_power_savings, 0644, sched_mc_power_savings_show,
+           sched_mc_power_savings_store);       /* presumably defines attr_sched_mc_power_savings (macro body not visible here) */
+#endif
+
+#ifdef CONFIG_SCHED_SMT
+static ssize_t sched_smt_power_savings_show(struct sys_device *dev, char *page)
+{      /* show handler for sched_smt_power_savings; dev is not used */
+       return sprintf(page, "%u\n", sched_smt_power_savings);  /* flag value plus newline; returns the sprintf() result */
+}
+static ssize_t sched_smt_power_savings_store(struct sys_device *dev, const char *buf, size_t count)
+{      /* store handler for the SMT flag; dev is not used */
+       return sched_power_savings_store(buf, count, 1);        /* smt == 1 selects sched_smt_power_savings */
  }
+SYSDEV_ATTR(sched_smt_power_savings, 0644, sched_smt_power_savings_show,
+           sched_smt_power_savings_store);      /* presumably defines attr_sched_smt_power_savings (macro body not visible here) */
+#endif
+
  
  #ifdef CONFIG_HOTPLUG_CPU
  /*
@@ -6310,7 +6716,8 @@ void normalize_rt_tasks(void)
                 if (!rt_task(p))
                         continue;
  
-               rq = task_rq_lock(p, &flags);
+               spin_lock_irqsave(&p->pi_lock, flags);
+               rq = __task_rq_lock(p);
  
                 array = p->array;
                 if (array)
@@ -6321,7 +6728,8 @@ void normalize_rt_tasks(void)
                         resched_task(rq->curr);
                 }
  
-               task_rq_unlock(rq, &flags);
+               __task_rq_unlock(rq);
+               spin_unlock_irqrestore(&p->pi_lock, flags);
         }
         read_unlock_irq(&tasklist_lock);
  }