Pileus Git - ~andy/linux/blob - kernel/sched/cputime.c

   1 #include <linux/export.h>
   2 #include <linux/sched.h>
   3 #include <linux/tsacct_kern.h>
   4 #include <linux/kernel_stat.h>
   5 #include <linux/static_key.h>
   6 #include "sched.h"
   7
   8
   9 #ifdef CONFIG_IRQ_TIME_ACCOUNTING
  10
  11 /*
  12  * There are no locks covering percpu hardirq/softirq time.
  13  * They are only modified in vtime_account, on corresponding CPU
  14  * with interrupts disabled. So, writes are safe.
  15  * They are read and saved off onto struct rq in update_rq_clock().
  16  * This may result in other CPU reading this CPU's irq time and can
  17  * race with irq/vtime_account on this CPU. We would either get old
  18  * or new value with a side effect of accounting a slice of irq time to wrong
  19  * task when irq is in progress while we read rq->clock. That is a worthy
  20  * compromise in place of having locks on each irq in account_system_time.
  21  */
  22 DEFINE_PER_CPU(u64, cpu_hardirq_time); /* per-cpu total charged by vtime_account() while hardirq_count() is non-zero */
  23 DEFINE_PER_CPU(u64, cpu_softirq_time); /* per-cpu total charged by vtime_account() while serving softirq, unless curr is ksoftirqd */
  24
  25 static DEFINE_PER_CPU(u64, irq_start_time); /* advanced to the current sched_clock_cpu() reading on every accounted vtime_account() call */
  26 static int sched_clock_irqtime; /* non-zero turns irq time accounting on; written by enable/disable_sched_clock_irqtime() */
  27
  28 void enable_sched_clock_irqtime(void)
  29 {
  30         sched_clock_irqtime = 1;
  31 }
  32
  33 void disable_sched_clock_irqtime(void)
  34 {
  35         sched_clock_irqtime = 0;
  36 }
  37
  38 #ifndef CONFIG_64BIT
  39 DEFINE_PER_CPU(seqcount_t, irq_time_seq); /* only defined on !64BIT builds; presumably what irq_time_write_begin()/irq_time_write_end() operate on -- confirm in sched.h */
  40 #endif /* !CONFIG_64BIT */
  41
  42 /*
  43  * Called before incrementing preempt_count on {soft,}irq_enter
  44  * and before decrementing preempt_count on {soft,}irq_exit.
  45  */
  46 void vtime_account(struct task_struct *curr)
  47 {
  48         unsigned long flags;
  49         s64 delta;
  50         int cpu;
  51
  52         if (!sched_clock_irqtime)
  53                 return;
  54
  55         local_irq_save(flags);
  56
  57         cpu = smp_processor_id();
  58         delta = sched_clock_cpu(cpu) - __this_cpu_read(irq_start_time);
  59         __this_cpu_add(irq_start_time, delta);
  60
  61         irq_time_write_begin();
  62         /*
  63          * We do not account for softirq time from ksoftirqd here.
  64          * We want to continue accounting softirq time to ksoftirqd thread
  65          * in that case, so as not to confuse scheduler with a special task
  66          * that do not consume any time, but still wants to run.
  67          */
  68         if (hardirq_count())
  69                 __this_cpu_add(cpu_hardirq_time, delta);
  70         else if (in_serving_softirq() && curr != this_cpu_ksoftirqd())
  71                 __this_cpu_add(cpu_softirq_time, delta);
  72
  73         irq_time_write_end();
  74         local_irq_restore(flags);
  75 }
  76 EXPORT_SYMBOL_GPL(vtime_account);
  77
  78 static int irqtime_account_hi_update(void)
  79 {
  80         u64 *cpustat = kcpustat_this_cpu->cpustat;
  81         unsigned long flags;
  82         u64 latest_ns;
  83         int ret = 0;
  84
  85         local_irq_save(flags);
  86         latest_ns = this_cpu_read(cpu_hardirq_time);
  87         if (nsecs_to_cputime64(latest_ns) > cpustat[CPUTIME_IRQ])
  88                 ret = 1;
  89         local_irq_restore(flags);
  90         return ret;
  91 }
  92
  93 static int irqtime_account_si_update(void)
  94 {
  95         u64 *cpustat = kcpustat_this_cpu->cpustat;
  96         unsigned long flags;
  97         u64 latest_ns;
  98         int ret = 0;
  99
 100         local_irq_save(flags);
 101         latest_ns = this_cpu_read(cpu_softirq_time);
 102         if (nsecs_to_cputime64(latest_ns) > cpustat[CPUTIME_SOFTIRQ])
 103                 ret = 1;
 104         local_irq_restore(flags);
 105         return ret;
 106 }
 107
 108 #else /* CONFIG_IRQ_TIME_ACCOUNTING */
 109
 110 #define sched_clock_irqtime     (0) /* no irq time accounting configured: "if (sched_clock_irqtime)" tests below are constant-false */
 111
 112 #endif /* !CONFIG_IRQ_TIME_ACCOUNTING */
 113
 114 static inline void task_group_account_field(struct task_struct *p, int index,
 115                                             u64 tmp)
 116 {
 117 #ifdef CONFIG_CGROUP_CPUACCT
 118         struct kernel_cpustat *kcpustat;
 119         struct cpuacct *ca;
 120 #endif
 121         /*
 122          * Since all updates are sure to touch the root cgroup, we
 123          * get ourselves ahead and touch it first. If the root cgroup
 124          * is the only cgroup, then nothing else should be necessary.
 125          *
 126          */
 127         __get_cpu_var(kernel_cpustat).cpustat[index] += tmp;
 128
 129 #ifdef CONFIG_CGROUP_CPUACCT
 130         if (unlikely(!cpuacct_subsys.active))
 131                 return;
 132
 133         rcu_read_lock();
 134         ca = task_ca(p);
 135         while (ca && (ca != &root_cpuacct)) {
 136                 kcpustat = this_cpu_ptr(ca->cpustat);
 137                 kcpustat->cpustat[index] += tmp;
 138                 ca = parent_ca(ca);
 139         }
 140         rcu_read_unlock();
 141 #endif
 142 }
 143
 144 /*
 145  * Account user cpu time to a process.
 146  * @p: the process that the cpu time gets accounted to
 147  * @cputime: the cpu time spent in user space since the last update
 148  * @cputime_scaled: cputime scaled by cpu frequency
 149  */
 150 void account_user_time(struct task_struct *p, cputime_t cputime,
 151                        cputime_t cputime_scaled)
 152 {
 153         int index;
 154
 155         /* Add user time to process. */
 156         p->utime += cputime;
 157         p->utimescaled += cputime_scaled;
 158         account_group_user_time(p, cputime);
 159
 160         index = (TASK_NICE(p) > 0) ? CPUTIME_NICE : CPUTIME_USER;
 161
 162         /* Add user time to cpustat. */
 163         task_group_account_field(p, index, (__force u64) cputime);
 164
 165         /* Account for user time used */
 166         acct_update_integrals(p);
 167 }
 168
 169 /*
 170  * Account guest cpu time to a process.
 171  * @p: the process that the cpu time gets accounted to
 172  * @cputime: the cpu time spent in virtual machine since the last update
 173  * @cputime_scaled: cputime scaled by cpu frequency
 174  */
 175 static void account_guest_time(struct task_struct *p, cputime_t cputime,
 176                                cputime_t cputime_scaled)
 177 {
 178         u64 *cpustat = kcpustat_this_cpu->cpustat;
 179
 180         /* Add guest time to process. */
 181         p->utime += cputime;
 182         p->utimescaled += cputime_scaled;
 183         account_group_user_time(p, cputime);
 184         p->gtime += cputime;
 185
 186         /* Add guest time to cpustat. */
 187         if (TASK_NICE(p) > 0) {
 188                 cpustat[CPUTIME_NICE] += (__force u64) cputime;
 189                 cpustat[CPUTIME_GUEST_NICE] += (__force u64) cputime;
 190         } else {
 191                 cpustat[CPUTIME_USER] += (__force u64) cputime;
 192                 cpustat[CPUTIME_GUEST] += (__force u64) cputime;
 193         }
 194 }
 195
 196 /*
 197  * Account system cpu time to a process and desired cpustat field
 198  * @p: the process that the cpu time gets accounted to
 199  * @cputime: the cpu time spent in kernel space since the last update
 200  * @cputime_scaled: cputime scaled by cpu frequency
 201  * @target_cputime64: pointer to cpustat field that has to be updated
 202  */
 203 static inline
 204 void __account_system_time(struct task_struct *p, cputime_t cputime,
 205                         cputime_t cputime_scaled, int index)
 206 {
 207         /* Add system time to process. */
 208         p->stime += cputime;
 209         p->stimescaled += cputime_scaled;
 210         account_group_system_time(p, cputime);
 211
 212         /* Add system time to cpustat. */
 213         task_group_account_field(p, index, (__force u64) cputime);
 214
 215         /* Account for system time used */
 216         acct_update_integrals(p);
 217 }
 218
 219 /*
 220  * Account system cpu time to a process.
 221  * @p: the process that the cpu time gets accounted to
 222  * @hardirq_offset: the offset to subtract from hardirq_count()
 223  * @cputime: the cpu time spent in kernel space since the last update
 224  * @cputime_scaled: cputime scaled by cpu frequency
 225  */
 226 void account_system_time(struct task_struct *p, int hardirq_offset,
 227                          cputime_t cputime, cputime_t cputime_scaled)
 228 {
 229         int index;
 230
 231         if ((p->flags & PF_VCPU) && (irq_count() - hardirq_offset == 0)) {
 232                 account_guest_time(p, cputime, cputime_scaled);
 233                 return;
 234         }
 235
 236         if (hardirq_count() - hardirq_offset)
 237                 index = CPUTIME_IRQ;
 238         else if (in_serving_softirq())
 239                 index = CPUTIME_SOFTIRQ;
 240         else
 241                 index = CPUTIME_SYSTEM;
 242
 243         __account_system_time(p, cputime, cputime_scaled, index);
 244 }
 245
 246 /*
 247  * Account for involuntary wait time.
 248  * @cputime: the cpu time spent in involuntary wait
 249  */
 250 void account_steal_time(cputime_t cputime)
 251 {
 252         u64 *cpustat = kcpustat_this_cpu->cpustat;
 253
 254         cpustat[CPUTIME_STEAL] += (__force u64) cputime;
 255 }
 256
 257 /*
 258  * Account for idle time.
 259  * @cputime: the cpu time spent in idle wait
 260  */
 261 void account_idle_time(cputime_t cputime)
 262 {
 263         u64 *cpustat = kcpustat_this_cpu->cpustat;
 264         struct rq *rq = this_rq();
 265
 266         if (atomic_read(&rq->nr_iowait) > 0)
 267                 cpustat[CPUTIME_IOWAIT] += (__force u64) cputime;
 268         else
 269                 cpustat[CPUTIME_IDLE] += (__force u64) cputime;
 270 }
 271
 272 static __always_inline bool steal_account_process_tick(void)
 273 {
 274 #ifdef CONFIG_PARAVIRT
 275         if (static_key_false(&paravirt_steal_enabled)) {
 276                 u64 steal, st = 0;
 277
 278                 steal = paravirt_steal_clock(smp_processor_id());
 279                 steal -= this_rq()->prev_steal_time;
 280
 281                 st = steal_ticks(steal);
 282                 this_rq()->prev_steal_time += st * TICK_NSEC;
 283
 284                 account_steal_time(st);
 285                 return st;
 286         }
 287 #endif
 288         return false;
 289 }
 290
 291 #ifndef CONFIG_VIRT_CPU_ACCOUNTING
 292
 293 #ifdef CONFIG_IRQ_TIME_ACCOUNTING
 294 /*
 295  * Account a tick to a process and cpustat
 296  * @p: the process that the cpu time gets accounted to
 297  * @user_tick: is the tick from userspace
 298  * @rq: the pointer to rq
 299  *
 300  * Tick demultiplexing follows the order
 301  * - pending hardirq update
 302  * - pending softirq update
 303  * - user_time
 304  * - idle_time
 305  * - system time
 306  *   - check for guest_time
 307  *   - else account as system_time
 308  *
 309  * Check for hardirq is done both for system and user time as there is
 310  * no timer going off while we are on hardirq and hence we may never get an
 311  * opportunity to update it solely in system time.
 312  * p->stime and friends are only updated on system time and not on irq
 313  * softirq as those do not count in task exec_runtime any more.
 314  */
 315 static void irqtime_account_process_tick(struct task_struct *p, int user_tick,
 316                                                 struct rq *rq)
 317 {
 318         cputime_t one_jiffy_scaled = cputime_to_scaled(cputime_one_jiffy);
 319         u64 *cpustat = kcpustat_this_cpu->cpustat;
 320
 321         if (steal_account_process_tick())
 322                 return;
 323
 324         if (irqtime_account_hi_update()) {
 325                 cpustat[CPUTIME_IRQ] += (__force u64) cputime_one_jiffy;
 326         } else if (irqtime_account_si_update()) {
 327                 cpustat[CPUTIME_SOFTIRQ] += (__force u64) cputime_one_jiffy;
 328         } else if (this_cpu_ksoftirqd() == p) {
 329                 /*
 330                  * ksoftirqd time do not get accounted in cpu_softirq_time.
 331                  * So, we have to handle it separately here.
 332                  * Also, p->stime needs to be updated for ksoftirqd.
 333                  */
 334                 __account_system_time(p, cputime_one_jiffy, one_jiffy_scaled,
 335                                         CPUTIME_SOFTIRQ);
 336         } else if (user_tick) {
 337                 account_user_time(p, cputime_one_jiffy, one_jiffy_scaled);
 338         } else if (p == rq->idle) {
 339                 account_idle_time(cputime_one_jiffy);
 340         } else if (p->flags & PF_VCPU) { /* System time or guest time */
 341                 account_guest_time(p, cputime_one_jiffy, one_jiffy_scaled);
 342         } else {
 343                 __account_system_time(p, cputime_one_jiffy, one_jiffy_scaled,
 344                                         CPUTIME_SYSTEM);
 345         }
 346 }
 347
 348 static void irqtime_account_idle_ticks(int ticks)
 349 {
 350         int i;
 351         struct rq *rq = this_rq();
 352
 353         for (i = 0; i < ticks; i++)
 354                 irqtime_account_process_tick(current, 0, rq);
 355 }
 356 #else /* CONFIG_IRQ_TIME_ACCOUNTING */
 357 static void irqtime_account_idle_ticks(int ticks) {} /* no-op stub for builds without CONFIG_IRQ_TIME_ACCOUNTING */
 358 static void irqtime_account_process_tick(struct task_struct *p, int user_tick,
 359                                                 struct rq *rq) {} /* no-op stub; its caller is guarded by sched_clock_irqtime, which is (0) in this configuration */
 360 #endif /* CONFIG_IRQ_TIME_ACCOUNTING */
 361
 362 /*
 363  * Account a single tick of cpu time.
 364  * @p: the process that the cpu time gets accounted to
 365  * @user_tick: indicates if the tick is a user or a system tick
 366  */
 367 void account_process_tick(struct task_struct *p, int user_tick)
 368 {
 369         cputime_t one_jiffy_scaled = cputime_to_scaled(cputime_one_jiffy);
 370         struct rq *rq = this_rq();
 371
 372         if (sched_clock_irqtime) {
 373                 irqtime_account_process_tick(p, user_tick, rq);
 374                 return;
 375         }
 376
 377         if (steal_account_process_tick())
 378                 return;
 379
 380         if (user_tick)
 381                 account_user_time(p, cputime_one_jiffy, one_jiffy_scaled);
 382         else if ((p != rq->idle) || (irq_count() != HARDIRQ_OFFSET))
 383                 account_system_time(p, HARDIRQ_OFFSET, cputime_one_jiffy,
 384                                     one_jiffy_scaled);
 385         else
 386                 account_idle_time(cputime_one_jiffy);
 387 }
 388
 389 /*
 390  * Account multiple ticks of steal time.
 391  * @p: the process from which the cpu time has been stolen
 392  * @ticks: number of stolen ticks
 393  */
 394 void account_steal_ticks(unsigned long ticks)
 395 {
 396         account_steal_time(jiffies_to_cputime(ticks));
 397 }
 398
 399 /*
 400  * Account multiple ticks of idle time.
 401  * @ticks: number of stolen ticks
 402  */
 403 void account_idle_ticks(unsigned long ticks)
 404 {
 405
 406         if (sched_clock_irqtime) {
 407                 irqtime_account_idle_ticks(ticks);
 408                 return;
 409         }
 410
 411         account_idle_time(jiffies_to_cputime(ticks));
 412 }
 413
 414 #endif
 415
 416 /*
 417  * Use precise platform statistics if available:
 418  */
 419 #ifdef CONFIG_VIRT_CPU_ACCOUNTING
 420 void task_times(struct task_struct *p, cputime_t *ut, cputime_t *st)
 421 {
 422         *ut = p->utime;
 423         *st = p->stime;
 424 }
 425
 426 void thread_group_times(struct task_struct *p, cputime_t *ut, cputime_t *st)
 427 {
 428         struct task_cputime cputime;
 429
 430         thread_group_cputime(p, &cputime);
 431
 432         *ut = cputime.utime;
 433         *st = cputime.stime;
 434 }
 435
 436 /*
 437  * Archs that account the whole time spent in the idle task
 438  * (outside irq) as idle time can rely on this and just implement
 439  * vtime_account_system() and vtime_account_idle(). Archs that
 440  * have other meaning of the idle time (s390 only includes the
 441  * time spent by the CPU when it's in low power mode) must override
 442  * vtime_account().
 443  */
 444 #ifndef __ARCH_HAS_VTIME_ACCOUNT
 445 void vtime_account(struct task_struct *tsk)
 446 {
 447         unsigned long flags;
 448
 449         local_irq_save(flags);
 450
 451         if (in_interrupt() || !is_idle_task(tsk))
 452                 vtime_account_system(tsk);
 453         else
 454                 vtime_account_idle(tsk);
 455
 456         local_irq_restore(flags);
 457 }
 458 EXPORT_SYMBOL_GPL(vtime_account);
 459 #endif /* __ARCH_HAS_VTIME_ACCOUNT */
 460
 461 #else
 462
 463 #ifndef nsecs_to_cputime /* fallback, used only when no nsecs_to_cputime macro is already defined */
 464 # define nsecs_to_cputime(__nsecs)      nsecs_to_jiffies(__nsecs)
 465 #endif
 466
 467 static cputime_t scale_utime(cputime_t utime, cputime_t rtime, cputime_t total)
 468 {
 469         u64 temp = (__force u64) rtime;
 470
 471         temp *= (__force u64) utime;
 472
 473         if (sizeof(cputime_t) == 4)
 474                 temp = div_u64(temp, (__force u32) total);
 475         else
 476                 temp = div64_u64(temp, (__force u64) total);
 477
 478         return (__force cputime_t) temp;
 479 }
 480
 481 void task_times(struct task_struct *p, cputime_t *ut, cputime_t *st)
 482 {
 483         cputime_t rtime, utime = p->utime, total = utime + p->stime;
 484
 485         /*
 486          * Use CFS's precise accounting:
 487          */
 488         rtime = nsecs_to_cputime(p->se.sum_exec_runtime);
 489
 490         if (total)
 491                 utime = scale_utime(utime, rtime, total);
 492         else
 493                 utime = rtime;
 494
 495         /*
 496          * Compare with previous values, to keep monotonicity:
 497          */
 498         p->prev_utime = max(p->prev_utime, utime);
 499         p->prev_stime = max(p->prev_stime, rtime - p->prev_utime);
 500
 501         *ut = p->prev_utime;
 502         *st = p->prev_stime;
 503 }
 504
 505 /*
 506  * Must be called with siglock held.
 507  */
 508 void thread_group_times(struct task_struct *p, cputime_t *ut, cputime_t *st)
 509 {
 510         struct signal_struct *sig = p->signal;
 511         struct task_cputime cputime;
 512         cputime_t rtime, utime, total;
 513
 514         thread_group_cputime(p, &cputime);
 515
 516         total = cputime.utime + cputime.stime;
 517         rtime = nsecs_to_cputime(cputime.sum_exec_runtime);
 518
 519         if (total)
 520                 utime = scale_utime(cputime.utime, rtime, total);
 521         else
 522                 utime = rtime;
 523
 524         sig->prev_utime = max(sig->prev_utime, utime);
 525         sig->prev_stime = max(sig->prev_stime, rtime - sig->prev_utime);
 526
 527         *ut = sig->prev_utime;
 528         *st = sig->prev_stime;
 529 }
 530 #endif