X-Git-Url: http://pileus.org/git/?a=blobdiff_plain;f=kernel%2Fsched_fair.c;h=aa7f8412101658fc27e1487ded677acf695ac4d1;hb=6487a9d3b5476ffd1bbbe97e58cf26dbeb7a5d4a;hp=652e8bdef9aadb294b2e094a5d9620d95c26d873;hpb=7193bea53f9d9730bbc859777c2f86c76349914d;p=~andy%2Flinux diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c index 652e8bdef9a..aa7f8412101 100644 --- a/kernel/sched_fair.c +++ b/kernel/sched_fair.c @@ -24,7 +24,7 @@ /* * Targeted preemption latency for CPU-bound tasks: - * (default: 20ms * (1 + ilog(ncpus)), units: nanoseconds) + * (default: 5ms * (1 + ilog(ncpus)), units: nanoseconds) * * NOTE: this latency value is not the same as the concept of * 'timeslice length' - timeslices in CFS are of variable length @@ -34,13 +34,13 @@ * (to see the precise effective timeslice length of your workload, * run vmstat and monitor the context-switches (cs) field) */ -unsigned int sysctl_sched_latency = 20000000ULL; +unsigned int sysctl_sched_latency = 5000000ULL; /* * Minimal preemption granularity for CPU-bound tasks: - * (default: 4 msec * (1 + ilog(ncpus)), units: nanoseconds) + * (default: 1 msec * (1 + ilog(ncpus)), units: nanoseconds) */ -unsigned int sysctl_sched_min_granularity = 4000000ULL; +unsigned int sysctl_sched_min_granularity = 1000000ULL; /* * is kept at sysctl_sched_latency / sysctl_sched_min_granularity @@ -48,10 +48,10 @@ unsigned int sysctl_sched_min_granularity = 4000000ULL; static unsigned int sched_nr_latency = 5; /* - * After fork, child runs first. (default) If set to 0 then + * After fork, child runs first. If set to 0 (default) then * parent will (try to) run first. */ -const_debug unsigned int sysctl_sched_child_runs_first = 1; +unsigned int sysctl_sched_child_runs_first __read_mostly; /* * sys_sched_yield() compat mode @@ -63,13 +63,13 @@ unsigned int __read_mostly sysctl_sched_compat_yield; /* * SCHED_OTHER wake-up granularity. - * (default: 5 msec * (1 + ilog(ncpus)), units: nanoseconds) + * (default: 1 msec * (1 + ilog(ncpus)), units: nanoseconds) * * This option delays the preemption effects of decoupled workloads * and reduces their over-scheduling. Synchronous workloads will still * have immediate wakeup/sleep latencies. */ -unsigned int sysctl_sched_wakeup_granularity = 5000000UL; +unsigned int sysctl_sched_wakeup_granularity = 1000000UL; const_debug unsigned int sysctl_sched_migration_cost = 500000UL; @@ -79,11 +79,6 @@ static const struct sched_class fair_sched_class; * CFS operations on generic schedulable entities: */ -static inline struct task_struct *task_of(struct sched_entity *se) -{ - return container_of(se, struct task_struct, se); -} - #ifdef CONFIG_FAIR_GROUP_SCHED /* cpu runqueue to which this cfs_rq is attached */ @@ -95,6 +90,14 @@ static inline struct rq *rq_of(struct cfs_rq *cfs_rq) /* An entity is a task if it doesn't "own" a runqueue */ #define entity_is_task(se) (!se->my_q) +static inline struct task_struct *task_of(struct sched_entity *se) +{ +#ifdef CONFIG_SCHED_DEBUG + WARN_ON_ONCE(!entity_is_task(se)); +#endif + return container_of(se, struct task_struct, se); +} + /* Walk up scheduling entities hierarchy */ #define for_each_sched_entity(se) \ for (; se; se = se->parent) @@ -186,7 +189,12 @@ find_matching_se(struct sched_entity **se, struct sched_entity **pse) } } -#else /* CONFIG_FAIR_GROUP_SCHED */ +#else /* !CONFIG_FAIR_GROUP_SCHED */ + +static inline struct task_struct *task_of(struct sched_entity *se) +{ + return container_of(se, struct task_struct, se); +} static inline struct rq *rq_of(struct cfs_rq *cfs_rq) { @@ -537,6 +545,12 @@ update_stats_wait_end(struct cfs_rq *cfs_rq, struct sched_entity *se) schedstat_set(se->wait_count, se->wait_count + 1); schedstat_set(se->wait_sum, se->wait_sum + rq_of(cfs_rq)->clock - se->wait_start); +#ifdef CONFIG_SCHEDSTATS + if (entity_is_task(se)) { + trace_sched_stat_wait(task_of(se), + rq_of(cfs_rq)->clock - se->wait_start); + } +#endif schedstat_set(se->wait_start, 0); } @@ -628,8 +642,10 @@ static void enqueue_sleeper(struct cfs_rq *cfs_rq, struct sched_entity *se) se->sleep_start = 0; se->sum_sleep_runtime += delta; - if (tsk) + if (tsk) { account_scheduler_latency(tsk, delta >> 10, 1); + trace_sched_stat_sleep(tsk, delta); + } } if (se->block_start) { u64 delta = rq_of(cfs_rq)->clock - se->block_start; @@ -644,6 +660,12 @@ static void enqueue_sleeper(struct cfs_rq *cfs_rq, struct sched_entity *se) se->sum_sleep_runtime += delta; if (tsk) { + if (tsk->in_iowait) { + se->iowait_sum += delta; + se->iowait_count++; + trace_sched_stat_iowait(tsk, delta); + } + /* * Blocking time is in units of nanosecs, so shift by * 20 to get a milliseconds-range estimation of the @@ -705,11 +727,11 @@ place_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int initial) vruntime -= thresh; } - - /* ensure we never gain time by being placed backwards. */ - vruntime = max_vruntime(se->vruntime, vruntime); } + /* ensure we never gain time by being placed backwards. */ + vruntime = max_vruntime(se->vruntime, vruntime); + se->vruntime = vruntime; } @@ -1046,17 +1068,21 @@ static void yield_task_fair(struct rq *rq) * search starts with cpus closest then further out as needed, * so we always favor a closer, idle cpu. * Domains may include CPUs that are not usable for migration, - * hence we need to mask them out (cpu_active_mask) + * hence we need to mask them out (rq->rd->online) * * Returns the CPU we should wake onto. */ #if defined(ARCH_HAS_SCHED_WAKE_IDLE) + +#define cpu_rd_active(cpu, rq) cpumask_test_cpu(cpu, rq->rd->online) + static int wake_idle(int cpu, struct task_struct *p) { struct sched_domain *sd; int i; unsigned int chosen_wakeup_cpu; int this_cpu; + struct rq *task_rq = task_rq(p); /* * At POWERSAVINGS_BALANCE_WAKEUP level, if both this_cpu and prev_cpu @@ -1089,10 +1115,10 @@ static int wake_idle(int cpu, struct task_struct *p) for_each_domain(cpu, sd) { if ((sd->flags & SD_WAKE_IDLE) || ((sd->flags & SD_WAKE_IDLE_FAR) - && !task_hot(p, task_rq(p)->clock, sd))) { + && !task_hot(p, task_rq->clock, sd))) { for_each_cpu_and(i, sched_domain_span(sd), &p->cpus_allowed) { - if (cpu_active(i) && idle_cpu(i)) { + if (cpu_rd_active(i, task_rq) && idle_cpu(i)) { if (i != task_cpu(p)) { schedstat_inc(p, se.nr_wakeups_idle); @@ -1235,7 +1261,17 @@ wake_affine(struct sched_domain *this_sd, struct rq *this_rq, tg = task_group(p); weight = p->se.load.weight; - balanced = 100*(tl + effective_load(tg, this_cpu, weight, weight)) <= + /* + * In low-load situations, where prev_cpu is idle and this_cpu is idle + * due to the sync cause above having dropped tl to 0, we'll always have + * an imbalance, but there's really nothing you can do about that, so + * that's good too. + * + * Otherwise check if either cpus are near enough in load to allow this + * task to be woken on this_cpu. + */ + balanced = !tl || + 100*(tl + effective_load(tg, this_cpu, weight, weight)) <= imbalance*(load + effective_load(tg, prev_cpu, 0, weight)); /* @@ -1278,8 +1314,6 @@ static int select_task_rq_fair(struct task_struct *p, int sync) this_rq = cpu_rq(this_cpu); new_cpu = prev_cpu; - if (prev_cpu == this_cpu) - goto out; /* * 'this_sd' is the first domain that both * this_cpu and prev_cpu are present in: @@ -1721,6 +1755,8 @@ static void task_new_fair(struct rq *rq, struct task_struct *p) sched_info_queued(p); update_curr(cfs_rq); + if (curr) + se->vruntime = curr->vruntime; place_entity(cfs_rq, se, 1); /* 'curr' will be NULL if the child belongs to a different group */